summaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/Kconfig143
-rw-r--r--net/netfilter/Makefile46
-rw-r--r--net/netfilter/core.c259
-rw-r--r--net/netfilter/ipset/Kconfig2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h18
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c13
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c2
-rw-r--r--net/netfilter/ipset/ip_set_core.c414
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h214
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c45
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c24
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c39
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c17
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c17
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c6
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c17
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c36
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c40
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c17
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c34
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c51
-rw-r--r--net/netfilter/ipvs/Kconfig45
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c13
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c240
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c425
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c674
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c887
-rw-r--r--net/netfilter/ipvs/ip_vs_fo.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c18
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c18
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_ovf.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c7
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c15
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c67
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c139
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c150
-rw-r--r--net/netfilter/nf_bpf_link.c332
-rw-r--r--net/netfilter/nf_conncount.c245
-rw-r--r--net/netfilter/nf_conntrack_acct.c21
-rw-r--r--net/netfilter/nf_conntrack_amanda.c2
-rw-r--r--net/netfilter/nf_conntrack_bpf.c550
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c9
-rw-r--r--net/netfilter/nf_conntrack_core.c1338
-rw-r--r--net/netfilter/nf_conntrack_ecache.c416
-rw-r--r--net/netfilter/nf_conntrack_expect.c69
-rw-r--r--net/netfilter/nf_conntrack_extend.c148
-rw-r--r--net/netfilter/nf_conntrack_ftp.c19
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c10
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c279
-rw-r--r--net/netfilter/nf_conntrack_helper.c113
-rw-r--r--net/netfilter/nf_conntrack_irc.c56
-rw-r--r--net/netfilter/nf_conntrack_labels.c35
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c5
-rw-r--r--net/netfilter/nf_conntrack_netlink.c1113
-rw-r--r--net/netfilter/nf_conntrack_ovs.c185
-rw-r--r--net/netfilter/nf_conntrack_pptp.c66
-rw-r--r--net/netfilter/nf_conntrack_proto.c191
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c770
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c25
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c7
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c60
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c288
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c655
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c49
-rw-r--r--net/netfilter/nf_conntrack_sane.c65
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c16
-rw-r--r--net/netfilter/nf_conntrack_sip.c19
-rw-r--r--net/netfilter/nf_conntrack_standalone.c412
-rw-r--r--net/netfilter/nf_conntrack_timeout.c71
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c20
-rw-r--r--net/netfilter/nf_dup_netdev.c41
-rw-r--r--net/netfilter/nf_flow_table_bpf.c121
-rw-r--r--net/netfilter/nf_flow_table_core.c602
-rw-r--r--net/netfilter/nf_flow_table_inet.c50
-rw-r--r--net/netfilter/nf_flow_table_ip.c891
-rw-r--r--net/netfilter/nf_flow_table_offload.c393
-rw-r--r--net/netfilter/nf_flow_table_path.c330
-rw-r--r--net/netfilter/nf_flow_table_procfs.c80
-rw-r--r--net/netfilter/nf_flow_table_xdp.c147
-rw-r--r--net/netfilter/nf_hooks_lwtunnel.c123
-rw-r--r--net/netfilter/nf_internals.h6
-rw-r--r--net/netfilter/nf_log.c70
-rw-r--r--net/netfilter/nf_log_common.c212
-rw-r--r--net/netfilter/nf_log_netdev.c78
-rw-r--r--net/netfilter/nf_log_syslog.c1085
-rw-r--r--net/netfilter/nf_nat_amanda.c14
-rw-r--r--net/netfilter/nf_nat_bpf.c77
-rw-r--r--net/netfilter/nf_nat_core.c387
-rw-r--r--net/netfilter/nf_nat_ftp.c17
-rw-r--r--net/netfilter/nf_nat_helper.c31
-rw-r--r--net/netfilter/nf_nat_irc.c16
-rw-r--r--net/netfilter/nf_nat_masquerade.c173
-rw-r--r--net/netfilter/nf_nat_ovs.c136
-rw-r--r--net/netfilter/nf_nat_proto.c175
-rw-r--r--net/netfilter/nf_nat_redirect.c98
-rw-r--r--net/netfilter/nf_nat_sip.c14
-rw-r--r--net/netfilter/nf_queue.c191
-rw-r--r--net/netfilter/nf_sockopt.c60
-rw-r--r--net/netfilter/nf_synproxy_core.c53
-rw-r--r--net/netfilter/nf_tables_api.c8094
-rw-r--r--net/netfilter/nf_tables_core.c207
-rw-r--r--net/netfilter/nf_tables_offload.c259
-rw-r--r--net/netfilter/nf_tables_trace.c184
-rw-r--r--net/netfilter/nfnetlink.c288
-rw-r--r--net/netfilter/nfnetlink_acct.c135
-rw-r--r--net/netfilter/nfnetlink_cthelper.c98
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c308
-rw-r--r--net/netfilter/nfnetlink_hook.c486
-rw-r--r--net/netfilter/nfnetlink_log.c129
-rw-r--r--net/netfilter/nfnetlink_osf.c37
-rw-r--r--net/netfilter/nfnetlink_queue.c445
-rw-r--r--net/netfilter/nft_bitwise.c472
-rw-r--r--net/netfilter/nft_byteorder.c56
-rw-r--r--net/netfilter/nft_chain_filter.c198
-rw-r--r--net/netfilter/nft_chain_nat.c5
-rw-r--r--net/netfilter/nft_chain_route.c8
-rw-r--r--net/netfilter/nft_cmp.c234
-rw-r--r--net/netfilter/nft_compat.c214
-rw-r--r--net/netfilter/nft_connlimit.c94
-rw-r--r--net/netfilter/nft_counter.c171
-rw-r--r--net/netfilter/nft_ct.c207
-rw-r--r--net/netfilter/nft_ct_fast.c62
-rw-r--r--net/netfilter/nft_dup_netdev.c17
-rw-r--r--net/netfilter/nft_dynset.c273
-rw-r--r--net/netfilter/nft_exthdr.c397
-rw-r--r--net/netfilter/nft_fib.c82
-rw-r--r--net/netfilter/nft_fib_inet.c2
-rw-r--r--net/netfilter/nft_fib_netdev.c2
-rw-r--r--net/netfilter/nft_flow_offload.c103
-rw-r--r--net/netfilter/nft_fwd_netdev.c46
-rw-r--r--net/netfilter/nft_hash.c71
-rw-r--r--net/netfilter/nft_immediate.c190
-rw-r--r--net/netfilter/nft_inner.c431
-rw-r--r--net/netfilter/nft_last.c138
-rw-r--r--net/netfilter/nft_limit.c220
-rw-r--r--net/netfilter/nft_log.c29
-rw-r--r--net/netfilter/nft_lookup.c192
-rw-r--r--net/netfilter/nft_masq.c113
-rw-r--r--net/netfilter/nft_meta.c186
-rw-r--r--net/netfilter/nft_nat.c66
-rw-r--r--net/netfilter/nft_numgen.c86
-rw-r--r--net/netfilter/nft_objref.c112
-rw-r--r--net/netfilter/nft_osf.c78
-rw-r--r--net/netfilter/nft_payload.c545
-rw-r--r--net/netfilter/nft_queue.c47
-rw-r--r--net/netfilter/nft_quota.c83
-rw-r--r--net/netfilter/nft_range.c39
-rw-r--r--net/netfilter/nft_redir.c124
-rw-r--r--net/netfilter/nft_reject.c24
-rw-r--r--net/netfilter/nft_reject_inet.c75
-rw-r--r--net/netfilter/nft_reject_netdev.c190
-rw-r--r--net/netfilter/nft_rt.c24
-rw-r--r--net/netfilter/nft_set_bitmap.c89
-rw-r--r--net/netfilter/nft_set_hash.c443
-rw-r--r--net/netfilter/nft_set_pipapo.c1001
-rw-r--r--net/netfilter/nft_set_pipapo.h88
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c241
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h6
-rw-r--r--net/netfilter/nft_set_rbtree.c666
-rw-r--r--net/netfilter/nft_socket.c215
-rw-r--r--net/netfilter/nft_synproxy.c23
-rw-r--r--net/netfilter/nft_tproxy.c74
-rw-r--r--net/netfilter/nft_tunnel.c114
-rw-r--r--net/netfilter/nft_xfrm.c60
-rw-r--r--net/netfilter/utils.c87
-rw-r--r--net/netfilter/x_tables.c277
-rw-r--r--net/netfilter/xt_AUDIT.c2
-rw-r--r--net/netfilter/xt_CHECKSUM.c33
-rw-r--r--net/netfilter/xt_CLASSIFY.c16
-rw-r--r--net/netfilter/xt_CONNSECMARK.c38
-rw-r--r--net/netfilter/xt_CT.c144
-rw-r--r--net/netfilter/xt_DSCP.c8
-rw-r--r--net/netfilter/xt_HMARK.c2
-rw-r--r--net/netfilter/xt_IDLETIMER.c137
-rw-r--r--net/netfilter/xt_LED.c53
-rw-r--r--net/netfilter/xt_LOG.c11
-rw-r--r--net/netfilter/xt_NFLOG.c47
-rw-r--r--net/netfilter/xt_RATEEST.c51
-rw-r--r--net/netfilter/xt_REDIRECT.c10
-rw-r--r--net/netfilter/xt_SECMARK.c111
-rw-r--r--net/netfilter/xt_TCPMSS.c4
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c4
-rw-r--r--net/netfilter/xt_TPROXY.c38
-rw-r--r--net/netfilter/xt_TRACE.c37
-rw-r--r--net/netfilter/xt_addrtype.c15
-rw-r--r--net/netfilter/xt_bpf.c2
-rw-r--r--net/netfilter/xt_cgroup.c26
-rw-r--r--net/netfilter/xt_cluster.c33
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlimit.c74
-rw-r--r--net/netfilter/xt_connmark.c48
-rw-r--r--net/netfilter/xt_hashlimit.c36
-rw-r--r--net/netfilter/xt_length.c5
-rw-r--r--net/netfilter/xt_limit.c52
-rw-r--r--net/netfilter/xt_mark.c42
-rw-r--r--net/netfilter/xt_nat.c1
-rw-r--r--net/netfilter/xt_nfacct.c8
-rw-r--r--net/netfilter/xt_osf.c1
-rw-r--r--net/netfilter/xt_owner.c16
-rw-r--r--net/netfilter/xt_physdev.c2
-rw-r--r--net/netfilter/xt_recent.c28
-rw-r--r--net/netfilter/xt_repldata.h2
-rw-r--r--net/netfilter/xt_sctp.c3
-rw-r--r--net/netfilter/xt_socket.c20
-rw-r--r--net/netfilter/xt_statistic.c2
-rw-r--r--net/netfilter/xt_tcpudp.c110
-rw-r--r--net/netfilter/xt_time.c2
-rw-r--r--net/netfilter/xt_u32.c21
228 files changed, 26762 insertions, 11875 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 0ffe2b8723c4..6cdc994fdc8a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
menu "Core Netfilter Configuration"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
config NETFILTER_INGRESS
bool "Netfilter ingress support"
@@ -10,6 +10,17 @@ config NETFILTER_INGRESS
This allows you to classify packets from ingress using the Netfilter
infrastructure.
+config NETFILTER_EGRESS
+ bool "Netfilter egress support"
+ default y
+ select NET_EGRESS
+ help
+ This allows you to classify packets before transmission using the
+ Netfilter infrastructure.
+
+config NETFILTER_SKIP_EGRESS
+ def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB)
+
config NETFILTER_NETLINK
tristate
@@ -19,6 +30,19 @@ config NETFILTER_FAMILY_BRIDGE
config NETFILTER_FAMILY_ARP
bool
+config NETFILTER_BPF_LINK
+ def_bool BPF_SYSCALL
+
+config NETFILTER_NETLINK_HOOK
+ tristate "Netfilter base hook dump support"
+ depends on NETFILTER_ADVANCED
+ depends on NF_TABLES
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ to list the base netfilter hooks via NFNETLINK.
+ This is helpful for debugging.
+
config NETFILTER_NETLINK_ACCT
tristate "Netfilter NFACCT over NFNETLINK interface"
depends on NETFILTER_ADVANCED
@@ -71,12 +95,17 @@ config NF_CONNTRACK
To compile it as a module, choose M here. If unsure, say N.
-config NF_LOG_COMMON
- tristate
-
-config NF_LOG_NETDEV
- tristate "Netdev packet logging"
- select NF_LOG_COMMON
+config NF_LOG_SYSLOG
+ tristate "Syslog packet logging"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enable support for packet logging via syslog.
+ It supports IPv4, IPV6, ARP and common transport protocols such
+ as TCP and UDP.
+ This is a simpler but less flexible logging method compared to
+ CONFIG_NETFILTER_NETLINK_LOG.
+ If both are enabled the backend to use can be configured at run-time
+ by means of per-address-family sysctl tunables.
if NF_CONNTRACK
config NETFILTER_CONNCOUNT
@@ -94,7 +123,7 @@ config NF_CONNTRACK_MARK
config NF_CONNTRACK_SECMARK
bool 'Connection tracking security mark support'
depends on NETWORK_SECMARK
- default m if NETFILTER_ADVANCED=n
+ default y if NETFILTER_ADVANCED=n
help
This option enables security markings to be applied to
connections. Typically they are copied to connections from
@@ -118,7 +147,6 @@ config NF_CONNTRACK_ZONES
config NF_CONNTRACK_PROCFS
bool "Supply CT list in procfs (OBSOLETE)"
- default y
depends on PROC_FS
help
This option enables for the list of known conntrack entries
@@ -164,15 +192,8 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It can be used with xtables connlabel
match and the nftables ct expression.
-config NF_CT_PROTO_DCCP
- bool 'DCCP protocol connection tracking support'
- depends on NETFILTER_ADVANCED
- default y
- help
- With this option enabled, the layer 3 independent connection
- tracking code will be able to do state tracking on DCCP connections.
-
- If unsure, say Y.
+config NF_CONNTRACK_OVS
+ bool
config NF_CT_PROTO_GRE
bool
@@ -181,7 +202,7 @@ config NF_CT_PROTO_SCTP
bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default y
- select LIBCRC32C
+ select NET_CRC32C
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
@@ -434,6 +455,9 @@ config NF_NAT_REDIRECT
config NF_NAT_MASQUERADE
bool
+config NF_NAT_OVS
+ bool
+
config NETFILTER_SYNPROXY
tristate
@@ -441,13 +465,14 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
+ select NET_CRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
provides a pseudo-state machine with an extensible instruction-set
(also known as expressions) that the userspace 'nft' utility
- (http://www.netfilter.org/projects/nftables) uses to build the
+ (https://www.netfilter.org/projects/nftables) uses to build the
rule-set. It also comes with the generic set infrastructure that
allows you to construct mappings between matchings and actions
for performance lookups.
@@ -481,6 +506,12 @@ config NFT_CT
This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
+config NFT_EXTHDR_DCCP
+ bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)"
+ default n
+ help
+ This option adds support for matching on DCCP extension headers.
+
config NFT_FLOW_OFFLOAD
depends on NF_CONNTRACK && NF_FLOW_TABLE
tristate "Netfilter nf_tables hardware flow offload module"
@@ -488,12 +519,6 @@ config NFT_FLOW_OFFLOAD
This option adds the "flow_offload" expression that you can use to
choose what flows are placed into the hardware.
-config NFT_COUNTER
- tristate "Netfilter nf_tables counter module"
- help
- This option adds the "counter" expression that you can use to
- include packet and byte counters in a rule.
-
config NFT_CONNLIMIT
tristate "Netfilter nf_tables connlimit module"
depends on NF_CONNTRACK
@@ -548,12 +573,6 @@ config NFT_TUNNEL
This option adds the "tunnel" expression that you can use to set
tunneling policies.
-config NFT_OBJREF
- tristate "Netfilter nf_tables stateful object reference module"
- help
- This option adds the "objref" expression that allows you to refer to
- stateful objects, such as counters and quotas.
-
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
@@ -681,6 +700,16 @@ config NFT_FIB_NETDEV
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_REJECT_NETDEV
+ depends on NFT_REJECT_IPV4
+ depends on NFT_REJECT_IPV6
+ tristate "Netfilter nf_tables netdev REJECT support"
+ help
+ This option enables the REJECT support from the netdev table.
+ The return packet generation will be delegated to the IPv4
+ or IPv6 ICMP or TCP RST implementation depending on the
+ protocol of the packet.
+
endif # NF_TABLES_NETDEV
endif # NF_TABLES
@@ -703,6 +732,14 @@ config NF_FLOW_TABLE
To compile it as a module, choose M here.
+config NF_FLOW_TABLE_PROCFS
+ bool "Supply flow table statistics in procfs"
+ depends on NF_FLOW_TABLE
+ depends on PROC_FS
+ help
+ This option enables for the flow table offload statistics
+ to be shown in procfs under net/netfilter/nf_flowtable.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
@@ -712,6 +749,25 @@ config NETFILTER_XTABLES
if NETFILTER_XTABLES
+config NETFILTER_XTABLES_COMPAT
+ bool "Netfilter Xtables 32bit support"
+ depends on COMPAT
+ help
+ This option provides a translation layer to run 32bit arp,ip(6),ebtables
+ binaries on 64bit kernels.
+
+ If unsure, say N.
+
+config NETFILTER_XTABLES_LEGACY
+ bool "Netfilter legacy tables support"
+ depends on !PREEMPT_RT
+ help
+ Say Y here if you still require support for legacy tables. This is
+ required by the legacy tools (iptables-legacy) and is not needed if
+ you use iptables over nftables (iptables-nft).
+ Legacy support is not limited to IP, it also includes EBTABLES and
+ ARPTABLES.
+
comment "Xtables combined modules"
config NETFILTER_XT_MARK
@@ -768,7 +824,7 @@ config NETFILTER_XT_TARGET_AUDIT
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a `CHECKSUM' target, which can be used in the iptables mangle
@@ -790,7 +846,7 @@ config NETFILTER_XT_TARGET_CLASSIFY
the priority of a packet. Some qdiscs can use this value for
classification, among these are:
- atm, cbq, dsmark, pfifo_fast, htb, prio
+ atm, cbq, dsmark, pfifo_fast, htb, prio
To compile it as a module, choose M here. If unsure, say N.
@@ -819,7 +875,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK
config NETFILTER_XT_TARGET_CT
tristate '"CT" target support'
depends on NF_CONNTRACK
- depends on IP_NF_RAW || IP6_NF_RAW
+ depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This options adds a `CT' target, which allows to specify initial
@@ -830,7 +886,7 @@ config NETFILTER_XT_TARGET_CT
config NETFILTER_XT_TARGET_DSCP
tristate '"DSCP" and "TOS" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a `DSCP' target, which allows you to manipulate
@@ -846,7 +902,7 @@ config NETFILTER_XT_TARGET_DSCP
config NETFILTER_XT_TARGET_HL
tristate '"HL" hoplimit target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
@@ -911,8 +967,7 @@ config NETFILTER_XT_TARGET_LED
config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
- select NF_LOG_COMMON
- select NF_LOG_IPV4
+ select NF_LOG_SYSLOG
select NF_LOG_IPV6 if IP6_NF_IPTABLES
default m if NETFILTER_ADVANCED=n
help
@@ -1031,7 +1086,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
- depends on IP_NF_MANGLE
+ depends on IP_NF_MANGLE || NFT_COMPAT
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
select NF_TPROXY_IPV4
@@ -1098,7 +1153,7 @@ config NETFILTER_XT_TARGET_TCPMSS
config NETFILTER_XT_TARGET_TCPOPTSTRIP
tristate '"TCPOPTSTRIP" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a "TCPOPTSTRIP" target, which allows you to strip
@@ -1131,7 +1186,7 @@ config NETFILTER_XT_MATCH_CGROUP
tristate '"control group" match support'
depends on NETFILTER_ADVANCED
depends on CGROUPS
- select CGROUP_NET_CLASSID
+ select SOCK_CGROUP_DATA
help
Socket/process control group matching allows you to match locally
generated packets based on which net_cls control group processes
@@ -1229,9 +1284,9 @@ config NETFILTER_XT_MATCH_CPU
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_MATCH_DCCP
- tristate '"dccp" protocol match support'
+ tristate '"dccp" protocol match support (DEPRECATED)'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default n
help
With this option enabled, you will be able to use the iptables
`dccp' match in order to match on DCCP source/destination ports
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0e0ded87e27b..6bfc250e474f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -11,17 +11,24 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
+obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
@@ -48,15 +55,18 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o
-# generic transport layer logging
-obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
-
-# packet logging for netdev family
-obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
+obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o
+
+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -77,7 +87,8 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
+ nft_counter.o nft_objref.o nft_inner.o \
nft_chain_route.o nf_tables_offload.o \
nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
nft_set_pipapo.o
@@ -88,6 +99,12 @@ nf_tables-objs += nft_set_pipapo_avx2.o
endif
endif
+ifdef CONFIG_NFT_CT
+ifdef CONFIG_MITIGATION_RETPOLINE
+nf_tables-objs += nft_ct_fast.o
+endif
+endif
+
obj-$(CONFIG_NF_TABLES) += nf_tables.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
@@ -96,13 +113,12 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
-obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o
obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
-obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
@@ -125,7 +141,14 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
- nf_flow_table_offload.o
+ nf_flow_table_path.o \
+ nf_flow_table_offload.o nf_flow_table_xdp.o
+nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
+ifeq ($(CONFIG_NF_FLOW_TABLE),m)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o
+else ifeq ($(CONFIG_NF_FLOW_TABLE),y)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o
+endif
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
@@ -214,3 +237,6 @@ obj-$(CONFIG_IP_SET) += ipset/
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
+
+# lwtunnel
+obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 3ac7c8c1548d..11a702065bab 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -31,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
-DEFINE_PER_CPU(bool, nf_skb_duplicated);
-EXPORT_SYMBOL_GPL(nf_skb_duplicated);
-
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -58,7 +55,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
if (num == 0)
return NULL;
- e = kvzalloc(alloc, GFP_KERNEL);
+ e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
if (e)
e->num_hook_entries = num;
return e;
@@ -119,6 +116,18 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
for (i = 0; i < old_entries; i++) {
if (orig_ops[i] != &dummy_ops)
alloc_entries++;
+
+ /* Restrict BPF hook type to force a unique priority, not
+ * shared at attach time.
+ *
+ * This is mainly to avoid ordering issues between two
+ * different bpf programs, this doesn't prevent a normal
+ * hook at same priority as a bpf one (we don't want to
+ * prevent defrag, conntrack, iptables etc from attaching).
+ */
+ if (reg->priority == orig_ops[i]->priority &&
+ reg->hook_ops_type == NF_HOOK_OP_BPF)
+ return ERR_PTR(-EBUSY);
}
}
@@ -282,6 +291,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return NULL;
return net->nf.hooks_bridge + hooknum;
#endif
+#ifdef CONFIG_NETFILTER_INGRESS
+ case NFPROTO_INET:
+ if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS))
+ return NULL;
+ if (!dev || dev_net(dev) != net) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+ return &dev->nf_hooks_ingress;
+#endif
case NFPROTO_IPV4:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
return NULL;
@@ -290,12 +309,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
return NULL;
return net->nf.hooks_ipv6 + hooknum;
-#if IS_ENABLED(CONFIG_DECNET)
- case NFPROTO_DECNET:
- if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
- return NULL;
- return net->nf.hooks_decnet + hooknum;
-#endif
default:
WARN_ON_ONCE(1);
return NULL;
@@ -307,24 +320,106 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return &dev->nf_hooks_ingress;
}
#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hooknum == NF_NETDEV_EGRESS) {
+ if (dev && dev_net(dev) == net)
+ return &dev->nf_hooks_egress;
+ }
+#endif
WARN_ON_ONCE(1);
return NULL;
}
+static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg,
+ int hooknum)
+{
+#ifndef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == hooknum)
+ return -EOPNOTSUPP;
+#endif
+ if (reg->hooknum != hooknum ||
+ !reg->dev || dev_net(reg->dev) != net)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) ||
+ (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS))
+ return true;
+
+ return false;
+}
+
+static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS;
+}
+
+static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_inc(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
+static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_dec(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
+ int err;
- if (pf == NFPROTO_NETDEV) {
+ switch (pf) {
+ case NFPROTO_NETDEV:
#ifndef CONFIG_NETFILTER_INGRESS
if (reg->hooknum == NF_NETDEV_INGRESS)
return -EOPNOTSUPP;
#endif
- if (reg->hooknum != NF_NETDEV_INGRESS ||
+#ifndef CONFIG_NETFILTER_EGRESS
+ if (reg->hooknum == NF_NETDEV_EGRESS)
+ return -EOPNOTSUPP;
+#endif
+ if ((reg->hooknum != NF_NETDEV_INGRESS &&
+ reg->hooknum != NF_NETDEV_EGRESS) ||
!reg->dev || dev_net(reg->dev) != net)
return -EINVAL;
+ break;
+ case NFPROTO_INET:
+ if (reg->hooknum != NF_INET_INGRESS)
+ break;
+
+ err = nf_ingress_check(net, reg, NF_INET_INGRESS);
+ if (err < 0)
+ return err;
+ break;
}
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
@@ -336,21 +431,25 @@ static int __nf_register_net_hook(struct net *net, int pf,
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
- if (!IS_ERR(new_hooks))
+ if (!IS_ERR(new_hooks)) {
+ hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
+ }
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
- hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_inc_egress_queue();
#endif
+ nf_static_key_inc(reg, pf);
+
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
@@ -403,12 +502,14 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_dec_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_dec_egress_queue();
#endif
+ nf_static_key_dec(reg, pf);
} else {
WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
}
@@ -425,8 +526,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
if (reg->pf == NFPROTO_INET) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ if (reg->hooknum == NF_INET_INGRESS) {
+ __nf_unregister_net_hook(net, NFPROTO_INET, reg);
+ } else {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ }
} else {
__nf_unregister_net_hook(net, reg->pf, reg);
}
@@ -451,14 +556,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
int err;
if (reg->pf == NFPROTO_INET) {
- err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
- if (err < 0)
- return err;
-
- err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
- if (err < 0) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- return err;
+ if (reg->hooknum == NF_INET_INGRESS) {
+ err = __nf_register_net_hook(net, NFPROTO_INET, reg);
+ if (err < 0)
+ return err;
+ } else {
+ err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
+ if (err < 0)
+ return err;
+
+ err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
+ if (err < 0) {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ return err;
+ }
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
@@ -514,7 +625,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
case NF_ACCEPT:
break;
case NF_DROP:
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_NETFILTER_DROP);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
@@ -524,10 +636,10 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
if (ret == 1)
continue;
return ret;
+ case NF_STOLEN:
+ return NF_DROP_GETERR(verdict);
default:
- /* Implicit handling for NF_STOLEN, as well as any other
- * non conventional verdicts.
- */
+ WARN_ON_ONCE(1);
return 0;
}
}
@@ -540,11 +652,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
const struct nf_hook_entries *e)
{
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
int ret;
- INIT_LIST_HEAD(&sublist);
-
list_for_each_entry_safe(skb, next, head, list) {
skb_list_del_init(skb);
ret = nf_hook_slow(skb, state, e, 0);
@@ -559,32 +669,38 @@ EXPORT_SYMBOL(nf_hook_slow_list);
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
-struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
-struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
+const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v4_hook);
+
+const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v6_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
- __rcu __read_mostly;
-EXPORT_SYMBOL(ip_ct_attach);
+u8 nf_ctnetlink_has_listener;
+EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener);
-struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
+/* This does not belong here, but locally generated errors need it if connection
+ * tracking in use: without this, connection may not be in hash table, and hence
+ * manufactured ICMP or RST packets will not be associated with it.
+ */
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, const struct sk_buff *);
+ const struct nf_ct_hook *ct_hook;
if (skb->_nfct) {
rcu_read_lock();
- attach = rcu_dereference(ip_ct_attach);
- if (attach)
- attach(new, skb);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->attach(new, skb);
rcu_read_unlock();
}
}
@@ -592,20 +708,38 @@ EXPORT_SYMBOL(nf_ct_attach);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- BUG_ON(ct_hook == NULL);
- ct_hook->destroy(nfct);
+ if (ct_hook)
+ ct_hook->destroy(nfct);
rcu_read_unlock();
+
+ WARN_ON(!ct_hook);
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+void nf_ct_set_closing(struct nf_conntrack *nfct)
+{
+ const struct nf_ct_hook *ct_hook;
+
+ if (!nfct)
+ return;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->set_closing(nfct);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_set_closing);
+
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
bool ret = false;
rcu_read_lock();
@@ -644,10 +778,6 @@ static int __net_init netfilter_net_init(struct net *net)
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
-#if IS_ENABLED(CONFIG_DECNET)
- __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
-#endif
-
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
@@ -680,12 +810,21 @@ int __init netfilter_init(void)
if (ret < 0)
goto err;
+#ifdef CONFIG_LWTUNNEL
+ ret = netfilter_lwtunnel_init();
+ if (ret < 0)
+ goto err_lwtunnel_pernet;
+#endif
ret = netfilter_log_init();
if (ret < 0)
- goto err_pernet;
+ goto err_log_pernet;
return 0;
-err_pernet:
+err_log_pernet:
+#ifdef CONFIG_LWTUNNEL
+ netfilter_lwtunnel_fini();
+err_lwtunnel_pernet:
+#endif
unregister_pernet_subsys(&netfilter_net_ops);
err:
return ret;
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 3c273483df23..b1ea054bb82c 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -30,7 +30,7 @@ config IP_SET_BITMAP_IP
depends on IP_SET
help
This option adds the bitmap:ip set type support, by which one
- can store IPv4 addresses (or network addresse) from a range.
+ can store IPv4 addresses (or network addresses) from a range.
To compile it as a module, choose M here. If unsure, say N.
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..798c7993635e 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -4,6 +4,8 @@
#ifndef __IP_SET_BITMAP_IP_GEN_H
#define __IP_SET_BITMAP_IP_GEN_H
+#include <linux/rcupdate_wait.h>
+
#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
@@ -28,6 +30,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
@@ -57,9 +60,6 @@ mtype_destroy(struct ip_set *set)
{
struct mtype *map = set->data;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
-
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
ip_set_free(map->members);
@@ -264,7 +264,7 @@ out:
static void
mtype_gc(struct timer_list *t)
{
- struct mtype *map = from_timer(map, t, gc);
+ struct mtype *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
void *x;
u32 id;
@@ -288,6 +288,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_delete_sync(&map->gc);
+}
+
static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt,
.uadt = mtype_uadt,
@@ -301,6 +310,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head,
.list = mtype_list,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
};
#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 486959f70cf3..5988b9bb9029 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to) {
+ if (ip > ip_to)
swap(ip, ip_to);
- if (ip < map->first_ip)
- return -IPSET_ERR_BITMAP_RANGE;
- }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ip_to = ip;
}
- if (ip_to > map->last_ip)
+ if (ip < map->first_ip || ip_to > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
for (; !before(ip_to, ip); ip += map->hosts) {
@@ -308,8 +305,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
- hosts = 2 << (32 - netmask - 1);
- elements = 2 << (netmask - mask_bits - 1);
+ hosts = 2U << (32 - netmask - 1);
+ elements = 2UL << (netmask - mask_bits - 1);
}
if (elements > IPSET_BITMAP_MAX_RANGE + 1)
return -IPSET_ERR_BITMAP_RANGE_SIZE;
@@ -326,7 +323,7 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
set->variant = &bitmap_ip;
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 2310a316e0af..2c625e0f49ec 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -363,7 +363,7 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ipmac;
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index e56ced66f202..7138e080def4 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -274,7 +274,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_port;
if (!init_map_port(set, map, first_port, last_port)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 340cb955af25..cc20e6d56807 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -53,14 +53,17 @@ MODULE_DESCRIPTION("core IP set support");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
/* When the nfnl mutex or ip_set_ref_lock is held: */
-#define ip_set_dereference(p) \
- rcu_dereference_protected(p, \
+#define ip_set_dereference(inst) \
+ rcu_dereference_protected((inst)->ip_set_list, \
lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
- lockdep_is_held(&ip_set_ref_lock))
+ lockdep_is_held(&ip_set_ref_lock) || \
+ (inst)->is_deleted)
#define ip_set(inst, id) \
- ip_set_dereference((inst)->ip_set_list)[id]
+ ip_set_dereference(inst)[id]
#define ip_set_ref_netlink(inst,id) \
rcu_dereference_raw((inst)->ip_set_list)[id]
+#define ip_set_dereference_nfnl(p) \
+ rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* The set types are implemented in modules and registered set types
* can be found in ip_set_type_list. Adding/deleting types is
@@ -101,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision)
static bool
load_settype(const char *name)
{
+ if (!try_module_get(THIS_MODULE))
+ return false;
+
nfnl_unlock(NFNL_SUBSYS_IPSET);
pr_debug("try to load ip_set_%s\n", name);
if (request_module("ip_set_%s", name) < 0) {
pr_warn("Can't find ip_set type %s\n", name);
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return false;
}
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return true;
}
@@ -250,22 +258,7 @@ EXPORT_SYMBOL_GPL(ip_set_type_unregister);
void *
ip_set_alloc(size_t size)
{
- void *members = NULL;
-
- if (size < KMALLOC_MAX_SIZE)
- members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
-
- if (members) {
- pr_debug("%p: allocated with kmalloc\n", members);
- return members;
- }
-
- members = vzalloc(size);
- if (!members)
- return NULL;
- pr_debug("%p: allocated with vmalloc\n", members);
-
- return members;
+ return kvzalloc(size, GFP_KERNEL_ACCOUNT);
}
EXPORT_SYMBOL_GPL(ip_set_alloc);
@@ -286,8 +279,7 @@ flag_nested(const struct nlattr *nla)
static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
[IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
- [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr) },
+ [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
};
int
@@ -369,7 +361,7 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
if (unlikely(!c))
return;
- strlcpy(c->str, ext->comment, len + 1);
+ strscpy(c->str, ext->comment, len + 1);
set->ext_size += sizeof(*c) + strlen(c->str) + 1;
rcu_assign_pointer(comment->c, c);
}
@@ -460,6 +452,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
+ if (align < ip_set_extensions[id].align)
+ align = ip_set_extensions[id].align;
len = ALIGN(len, ip_set_extensions[id].align);
set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
@@ -650,13 +644,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set)) {
struct ip_set_counter *counter = ext_counter(data, set);
+ ip_set_update_counter(counter, ext, flags);
+
if (flags & IPSET_FLAG_MATCH_COUNTERS &&
!(ip_set_match_counter(ip_set_get_packets(counter),
mext->packets, mext->packets_op) &&
ip_set_match_counter(ip_set_get_bytes(counter),
mext->bytes, mext->bytes_op)))
return false;
- ip_set_update_counter(counter, ext, flags);
}
if (SET_WITH_SKBINFO(set))
ip_set_get_skbinfo(ext_skbinfo(data, set),
@@ -696,6 +691,14 @@ __ip_set_put(struct ip_set *set)
* a separate reference counter
*/
static void
+__ip_set_get_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref_netlink++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static void
__ip_set_put_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -713,15 +716,10 @@ __ip_set_put_netlink(struct ip_set *set)
static struct ip_set *
ip_set_rcu_get(struct net *net, ip_set_id_t index)
{
- struct ip_set *set;
struct ip_set_net *inst = ip_set_pernet(net);
- rcu_read_lock();
- /* ip_set_list itself needs to be protected */
- set = rcu_dereference(inst->ip_set_list)[index];
- rcu_read_unlock();
-
- return set;
+ /* ip_set_list and the set pointer need to be protected */
+ return ip_set_dereference_nfnl(inst->ip_set_list)[index];
}
static inline void
@@ -752,9 +750,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;
- rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
- rcu_read_unlock_bh();
if (ret == -EAGAIN) {
/* Type requests element to be completed */
@@ -887,7 +883,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
BUG_ON(!set);
read_lock_bh(&ip_set_ref_lock);
- strncpy(name, set->name, IPSET_MAXNAMELEN);
+ strscpy_pad(name, set->name, IPSET_MAXNAMELEN);
read_unlock_bh(&ip_set_ref_lock);
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
@@ -976,20 +972,9 @@ static struct nlmsghdr *
start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
enum ipset_cmd cmd)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
-
- nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd),
- sizeof(*nfmsg), flags);
- if (!nlh)
- return NULL;
-
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = NFPROTO_IPV4;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- return nlh;
+ return nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags,
+ NFPROTO_IPV4, NFNETLINK_V0, 0);
}
/* Create a set */
@@ -1055,26 +1040,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
return 0;
}
-static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
return -EOPNOTSUPP;
}
-static int ip_set_create(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
const char *name, *typename;
u8 family, revision;
- u32 flags = flag_exist(nlh);
+ u32 flags = flag_exist(info->nlh);
int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
@@ -1100,7 +1081,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
if (!set)
return -ENOMEM;
spin_lock_init(&set->lock);
- strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ strscpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
@@ -1122,8 +1103,10 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
ret = -IPSET_ERR_PROTOCOL;
goto put_out;
}
+ /* Set create flags depending on the type revision */
+ set->flags |= set->type->create_flags[revision];
- ret = set->type->create(net, set, tb, flags);
+ ret = set->type->create(info->net, set, tb, flags);
if (ret != 0)
goto put_out;
@@ -1156,7 +1139,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
- tmp = ip_set_dereference(inst->ip_set_list);
+ tmp = ip_set_dereference(inst);
memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
rcu_assign_pointer(inst->ip_set_list, list);
/* Make sure all current packets have passed through */
@@ -1177,6 +1160,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
return ret;
cleanup:
+ set->variant->cancel_gc(set);
set->variant->destroy(set);
put_out:
module_put(set->type->me);
@@ -1194,23 +1178,56 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
+/* In order to return quickly when destroying a single set, it is split
+ * into two stages:
+ * - Cancel garbage collector
+ * - Destroy the set itself via call_rcu()
+ */
+
static void
-ip_set_destroy_set(struct ip_set *set)
+ip_set_destroy_set_rcu(struct rcu_head *head)
{
- pr_debug("set: %s\n", set->name);
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
- /* Must call it without holding any lock */
set->variant->destroy(set);
module_put(set->type->me);
kfree(set);
}
-static int ip_set_destroy(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static void
+_destroy_all_sets(struct ip_set_net *inst)
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set *set;
+ ip_set_id_t i;
+ bool need_wait = false;
+
+ /* First cancel gc's: set:list sets are flushed as well */
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ set->variant->cancel_gc(set);
+ if (set->type->features & IPSET_TYPE_NAME)
+ need_wait = true;
+ }
+ }
+ /* Must wait for flush to be really finished */
+ if (need_wait)
+ rcu_barrier();
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ ip_set(inst, i) = NULL;
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+ }
+ }
+}
+
+static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
+{
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
int ret = 0;
@@ -1218,21 +1235,18 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
-
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
* External systems (i.e. xt_set) must call
- * ip_set_put|get_nfnl_* functions, that way we
+ * ip_set_nfnl_get_* functions, that way we
* can safely check references here.
*
* list:set timer can only decrement the reference
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1242,29 +1256,34 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
}
inst->is_destroyed = true;
read_unlock_bh(&ip_set_ref_lock);
- for (i = 0; i < inst->ip_set_max; i++) {
- s = ip_set(inst, i);
- if (s) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(s);
- }
- }
+ _destroy_all_sets(inst);
/* Modified by ip_set_destroy() only, which is serialized */
inst->is_destroyed = false;
} else {
+ u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
- ret = -ENOENT;
+ if (!(flags & IPSET_FLAG_EXIST))
+ ret = -ENOENT;
goto out;
} else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1284,12 +1303,10 @@ ip_set_flush_set(struct ip_set *set)
ip_set_unlock(set);
}
-static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
@@ -1324,12 +1341,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
-static int ip_set_rename(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *s;
const char *name2;
ip_set_id_t i;
@@ -1358,7 +1373,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
goto out;
}
}
- strncpy(set->name, name2, IPSET_MAXNAMELEN);
+ strscpy_pad(set->name, name2, IPSET_MAXNAMELEN);
out:
write_unlock_bh(&ip_set_ref_lock);
@@ -1374,12 +1389,10 @@ out:
* so the ip_set_list always contains valid pointers to the sets.
*/
-static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *from, *to;
ip_set_id_t from_id, to_id;
char from_name[IPSET_MAXNAMELEN];
@@ -1414,9 +1427,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return -EBUSY;
}
- strncpy(from_name, from->name, IPSET_MAXNAMELEN);
- strncpy(from->name, to->name, IPSET_MAXNAMELEN);
- strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+ strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN);
+ strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN);
+ strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN);
swap(from->ref, to->ref);
ip_set(inst, from_id) = to;
@@ -1642,7 +1655,7 @@ dump_last:
goto next_set;
if (set->variant->uref)
set->variant->uref(set, cb, true);
- /* fall through */
+ fallthrough;
default:
ret = set->variant->list(set, skb, cb);
if (!cb->args[IPSET_CB_ARG0])
@@ -1689,10 +1702,8 @@ out:
return ret < 0 ? ret : skb->len;
}
-static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
@@ -1703,7 +1714,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
.dump = ip_set_dump_do,
.done = ip_set_dump_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
@@ -1719,8 +1730,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
};
static int
-call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
- struct nlattr *tb[], enum ipset_adt adt,
+call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt,
u32 flags, bool use_lineno)
{
int ret;
@@ -1728,13 +1739,22 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
+ if (retried) {
+ __ip_set_get_netlink(set);
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ cond_resched();
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ __ip_set_put_netlink(set);
+ }
+
ip_set_lock(set);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
ip_set_unlock(set);
retried = true;
- } while (ret == -EAGAIN &&
- set->variant->resize &&
- (ret = set->variant->resize(set, retried)) == 0);
+ } while (ret == -ERANGE ||
+ (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried)) == 0));
if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
return 0;
@@ -1753,11 +1773,13 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
skb2 = nlmsg_new(payload, GFP_KERNEL);
if (!skb2)
return -ENOMEM;
- rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ rep = nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = ret;
- memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len,
+ /* Bounds checked by the skb layer. */);
+
cmdattr = (void *)&errmsg->msg + min_len;
ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
@@ -1772,8 +1794,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
/* Signal netlink not to send its ACK/errmsg. */
return -EINTR;
}
@@ -1817,7 +1838,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
attr[IPSET_ATTR_DATA],
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, adt, flags,
+ ret = call_ad(net, ctnl, skb, set, tb, adt, flags,
use_lineno);
} else {
int nla_rem;
@@ -1828,7 +1849,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, adt,
+ ret = call_ad(net, ctnl, skb, set, tb, adt,
flags, use_lineno);
if (ret < 0)
return ret;
@@ -1837,30 +1858,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl,
return ret;
}
-static int ip_set_uadd(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- return ip_set_ad(net, ctnl, skb,
- IPSET_ADD, nlh, attr, extack);
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_ADD, info->nlh, attr, info->extack);
}
-static int ip_set_udel(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- return ip_set_ad(net, ctnl, skb,
- IPSET_DEL, nlh, attr, extack);
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_DEL, info->nlh, attr, info->extack);
}
-static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
@@ -1892,16 +1907,13 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
/* Get headed data of a set */
-static int ip_set_header(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
const struct ip_set *set;
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -1915,7 +1927,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_HEADER);
if (!nlh2)
goto nlmsg_failure;
@@ -1927,11 +1939,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1949,10 +1957,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
};
-static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
@@ -1975,7 +1981,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_TYPE);
if (!nlh2)
goto nlmsg_failure;
@@ -1988,11 +1994,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
nlmsg_end(skb2, nlh2);
pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2008,14 +2010,11 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
};
-static int ip_set_protocol(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
return -IPSET_ERR_PROTOCOL;
@@ -2024,7 +2023,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_PROTOCOL);
if (!nlh2)
goto nlmsg_failure;
@@ -2034,11 +2033,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2049,17 +2044,14 @@ nlmsg_failure:
/* Get set by name or index, from userspace */
-static int ip_set_byname(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -2073,7 +2065,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYNAME);
if (!nlh2)
goto nlmsg_failure;
@@ -2083,11 +2075,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2101,17 +2089,14 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_INDEX] = { .type = NLA_U16 },
};
-static int ip_set_byindex(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_INDEX]))
@@ -2128,7 +2113,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYINDEX);
if (!nlh2)
goto nlmsg_failure;
@@ -2137,11 +2122,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -2153,80 +2134,96 @@ nlmsg_failure:
static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
[IPSET_CMD_NONE] = {
.call = ip_set_none,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
},
[IPSET_CMD_CREATE] = {
.call = ip_set_create,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_create_policy,
},
[IPSET_CMD_DESTROY] = {
.call = ip_set_destroy,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_FLUSH] = {
.call = ip_set_flush,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_RENAME] = {
.call = ip_set_rename,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_SWAP] = {
.call = ip_set_swap,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_LIST] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_dump_policy,
},
[IPSET_CMD_SAVE] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_ADD] = {
.call = ip_set_uadd,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_DEL] = {
.call = ip_set_udel,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_TEST] = {
.call = ip_set_utest,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_HEADER] = {
.call = ip_set_header,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_TYPE] = {
.call = ip_set_type,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_type_policy,
},
[IPSET_CMD_PROTOCOL] = {
.call = ip_set_protocol,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_protocol_policy,
},
[IPSET_CMD_GET_BYNAME] = {
.call = ip_set_byname,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_GET_BYINDEX] = {
.call = ip_set_byindex,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_index_policy,
},
@@ -2390,29 +2387,25 @@ ip_set_net_init(struct net *net)
}
static void __net_exit
-ip_set_net_exit(struct net *net)
+ip_set_net_pre_exit(struct net *net)
{
struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set = NULL;
- ip_set_id_t i;
-
inst->is_deleted = true; /* flag for ip_set_nfnl_put */
+}
- nfnl_lock(NFNL_SUBSYS_IPSET);
- for (i = 0; i < inst->ip_set_max; i++) {
- set = ip_set(inst, i);
- if (set) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(set);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_IPSET);
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ _destroy_all_sets(inst);
kvfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
static struct pernet_operations ip_set_net_ops = {
.init = ip_set_net_init,
+ .pre_exit = ip_set_net_pre_exit,
.exit = ip_set_net_exit,
.id = &ip_set_net_id,
.size = sizeof(struct ip_set_net),
@@ -2451,8 +2444,11 @@ ip_set_fini(void)
{
nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
unregister_pernet_subsys(&ip_set_net_ops);
+
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
pr_debug("these are the famous last words\n");
}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 1ee43752d6d3..5e4453e9ef8e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -5,6 +5,7 @@
#define _IP_SET_HASH_GEN_H
#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/jhash.h>
#include <linux/types.h>
#include <linux/netfilter/nfnetlink.h>
@@ -37,37 +38,12 @@
*/
/* Number of elements to store in an initial array block */
-#define AHASH_INIT_SIZE 4
+#define AHASH_INIT_SIZE 2
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max muber of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-#define AHASH_MAX(h) ((h)->ahash_max)
-
-static u8
-tune_ahash_max(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-
-#define TUNE_AHASH_MAX(h, multi) \
- ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
-#else
-#define AHASH_MAX(h) AHASH_MAX_SIZE
-#define TUNE_AHASH_MAX(h, multi)
-#endif
+#define AHASH_MAX(h) ((h)->bucketsize)
/* A hash bucket */
struct hbucket {
@@ -87,8 +63,8 @@ struct hbucket {
: jhash_size((htable_bits) - HTABLE_REGION_BITS))
#define ahash_sizeof_regions(htable_bits) \
(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
-#define ahash_region(n, htable_bits) \
- ((n) % ahash_numof_locks(htable_bits))
+#define ahash_region(n) \
+ ((n) / jhash_size(HTABLE_REGION_BITS))
#define ahash_bucket_start(h, htable_bits) \
((htable_bits) < HTABLE_REGION_BITS ? 0 \
: (h) * jhash_size(HTABLE_REGION_BITS))
@@ -132,31 +108,17 @@ htable_size(u8 hbits)
{
size_t hsize;
- /* We must fit both into u32 in jhash and size_t */
+ /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
- if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
+ if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;
return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
-/* Compute htable_bits from the user input parameter hashsize */
-static u8
-htable_bits(u32 hashsize)
-{
- /* Assume that hashsize == 2^htable_bits */
- u8 bits = fls(hashsize - 1);
-
- if (jhash_size(bits) != hashsize)
- /* Round up to the first 2^n value */
- bits = fls(hashsize);
-
- return bits;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -198,6 +160,17 @@ htable_bits(u32 hashsize)
(SET_WITH_TIMEOUT(set) && \
ip_set_timeout_expired(ext_timeout(d, set)))
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+static const union nf_inet_addr onesmask = {
+ .all[0] = 0xffffffff,
+ .all[1] = 0xffffffff,
+ .all[2] = 0xffffffff,
+ .all[3] = 0xffffffff
+};
+
+static const union nf_inet_addr zeromask = {};
+#endif
+
#endif /* _IP_SET_HASH_GEN_H */
#ifndef MTYPE
@@ -249,6 +222,7 @@ htable_bits(u32 hashsize)
#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
+#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match
@@ -293,6 +267,7 @@ htable_bits(u32 hashsize)
#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
@@ -321,11 +296,10 @@ struct htype {
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
-#ifdef IP_SET_HASH_WITH_MULTI
- u8 ahash_max; /* max elements in an array block */
-#endif
-#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 bucketsize; /* max elements in an array block */
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
u8 netmask; /* netmask value for subnets to store */
+ union nf_inet_addr bitmask; /* stores bitmask */
#endif
struct list_head ad; /* Resize add|del backlist */
struct mtype_elem next; /* temporary storage for uadd */
@@ -458,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference(hbucket(t, i));
+ n = (__force struct hbucket *)hbucket(t, i);
if (!n)
continue;
if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -478,10 +452,7 @@ mtype_destroy(struct ip_set *set)
struct htype *h = set->data;
struct list_head *l, *lt;
- if (SET_WITH_TIMEOUT(set))
- cancel_delayed_work_sync(&h->gc.dwork);
-
- mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+ mtype_ahash_destroy(set, (__force struct htable *)h->table, true);
list_for_each_safe(l, lt, &h->ad) {
list_del(l);
kfree(l);
@@ -500,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
/* Resizing changes htable_bits, so we ignore it */
return x->maxelem == y->maxelem &&
a->timeout == b->timeout &&
-#ifdef IP_SET_HASH_WITH_NETMASK
- x->netmask == y->netmask &&
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ nf_inet_addr_cmp(&x->bitmask, &y->bitmask) &&
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
x->markmask == y->markmask &&
@@ -627,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc)
queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ cancel_delayed_work_sync(&h->gc.dwork);
+}
+
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags);
@@ -644,7 +624,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t dsize = set->dsize;
+ size_t hsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -668,21 +648,19 @@ mtype_resize(struct ip_set *set, bool retried)
retry:
ret = 0;
htable_bits++;
- if (!htable_bits) {
- /* In case we have plenty of memory :-) */
- pr_warn("Cannot increase the hashsize of set %s further\n",
- set->name);
- ret = -IPSET_ERR_HASH_FULL;
- goto out;
- }
- t = ip_set_alloc(htable_size(htable_bits));
+ if (!htable_bits)
+ goto hbwarn;
+ hsize = htable_size(htable_bits);
+ if (!hsize)
+ goto hbwarn;
+ t = ip_set_alloc(hsize);
if (!t) {
ret = -ENOMEM;
goto out;
}
t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
if (!t->hregion) {
- kfree(t);
+ ip_set_free(t);
ret = -ENOMEM;
goto out;
}
@@ -724,7 +702,7 @@ retry:
#endif
key = HKEY(data, h->initval, htable_bits);
m = __ipset_dereference(hbucket(t, key));
- nr = ahash_region(key, htable_bits);
+ nr = ahash_region(key);
if (!m) {
m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
@@ -817,6 +795,12 @@ cleanup:
if (ret == -EAGAIN)
goto retry;
goto out;
+
+hbwarn:
+ /* In case we have plenty of memory :-) */
+ pr_warn("Cannot increase the hashsize of set %s further\n", set->name);
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
}
/* Get the current number of elements and ext_size in the set */
@@ -868,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
elements = t->hregion[r].elements;
maxelem = t->maxelem;
@@ -950,7 +934,12 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_AHASH_MAX(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize <= multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
@@ -1061,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
rcu_read_unlock_bh();
@@ -1296,15 +1285,32 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
htonl(jhash_size(htable_bits))) ||
nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_BITMASK
+ /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask
+ * and not bitmask. These two are mutually exclusive. */
+ if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) {
+ if (set->family == NFPROTO_IPV4) {
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip))
+ goto nla_put_failure;
+ } else if (set->family == NFPROTO_IPV6) {
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6))
+ goto nla_put_failure;
+ }
+ }
+#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- if (h->netmask != HOST_MASK &&
- nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
goto nla_put_failure;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure;
#endif
+ if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) {
+ if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) ||
+ nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval)))
+ goto nla_put_failure;
+ }
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
@@ -1443,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
.region_lock = true,
};
@@ -1456,8 +1463,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
u32 markmask;
#endif
u8 hbits;
-#ifdef IP_SET_HASH_WITH_NETMASK
- u8 netmask;
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ int ret __attribute__((unused)) = 0;
+ u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+ union nf_inet_addr bitmask = onesmask;
#endif
size_t hsize;
struct htype *h;
@@ -1495,7 +1504,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
@@ -1503,6 +1511,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
(set->family == NFPROTO_IPV6 && netmask > 128) ||
netmask == 0)
return -IPSET_ERR_INVALID_NETMASK;
+
+ /* we convert netmask to bitmask and store it */
+ if (set->family == NFPROTO_IPV4)
+ bitmask.ip = ip_set_netmask(netmask);
+ else
+ ip6_netmask(&bitmask, netmask);
+ }
+#endif
+
+#ifdef IP_SET_HASH_WITH_BITMASK
+ if (tb[IPSET_ATTR_BITMASK]) {
+ /* bitmask and netmask do the same thing, allow only one of these options */
+ if (tb[IPSET_ATTR_NETMASK])
+ return -IPSET_ERR_BITMASK_NETMASK_EXCL;
+
+ if (set->family == NFPROTO_IPV4) {
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip);
+ if (ret || !bitmask.ip)
+ return -IPSET_ERR_INVALID_NETMASK;
+ } else if (set->family == NFPROTO_IPV6) {
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask);
+ if (ret || ipv6_addr_any(&bitmask.in6))
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+
+ if (nf_inet_addr_cmp(&bitmask, &zeromask))
+ return -IPSET_ERR_INVALID_NETMASK;
}
#endif
@@ -1520,7 +1555,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (!h)
return -ENOMEM;
- hbits = htable_bits(hashsize);
+ /* Compute htable_bits from the user input parameter hashsize.
+ * Assume that hashsize == 2^htable_bits,
+ * otherwise round up to the first 2^n value.
+ */
+ hbits = fls(hashsize - 1);
hsize = htable_size(hbits);
if (hsize == 0) {
kfree(h);
@@ -1533,7 +1572,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
}
t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
if (!t->hregion) {
- kfree(t);
+ ip_set_free(t);
kfree(h);
return -ENOMEM;
}
@@ -1541,14 +1580,27 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
for (i = 0; i < ahash_numof_locks(hbits); i++)
spin_lock_init(&t->hregion[i].lock);
h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ h->bitmask = bitmask;
h->netmask = netmask;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
h->markmask = markmask;
#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
-
+ if (tb[IPSET_ATTR_INITVAL])
+ h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL]));
+ else
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->bucketsize = AHASH_MAX_SIZE;
+ if (tb[IPSET_ATTR_BUCKETSIZE]) {
+ h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]);
+ if (h->bucketsize < AHASH_INIT_SIZE)
+ h->bucketsize = AHASH_INIT_SIZE;
+ else if (h->bucketsize > AHASH_MAX_SIZE)
+ h->bucketsize = AHASH_MAX_SIZE;
+ else if (h->bucketsize % 2)
+ h->bucketsize += 1;
+ }
t->htable_bits = hbits;
t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 5d6d68eaf6a9..c9f4e3859663 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -23,7 +23,9 @@
/* 1 Counters support */
/* 2 Comments support */
/* 3 Forceadd support */
-#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
+/* 4 skbinfo support */
+/* 5 bucketsize, initval support */
+#define IPSET_TYPE_REV_MAX 6 /* bitmask support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -33,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip");
/* Type specific function prefix */
#define HTYPE hash_ip
#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -85,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
__be32 ip;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
- ip &= ip_set_netmask(h->netmask);
+ ip &= h->bitmask.ip;
if (ip == 0)
return -EINVAL;
@@ -97,11 +100,11 @@ static int
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip4 *h = set->data;
+ struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0, hosts;
+ u32 ip = 0, ip_to = 0, hosts, i = 0;
int ret = 0;
if (tb[IPSET_ATTR_LINENO])
@@ -118,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- ip &= ip_set_hostmask(h->netmask);
+ ip &= ntohl(h->bitmask.ip);
e.ip = htonl(ip);
if (e.ip == 0)
return -IPSET_ERR_HASH_ELEM;
@@ -131,8 +134,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -143,18 +149,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
- if (retried) {
+ if (retried)
ip = ntohl(h->next.ip);
+ for (; ip <= ip_to; i++) {
e.ip = htonl(ip);
- }
- for (; ip <= ip_to;) {
+ if (i > IPSET_MAX_RANGE) {
+ hash_ip4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
ip += hosts;
- e.ip = htonl(ip);
- if (e.ip == 0)
+ if (ip == 0)
return 0;
ret = 0;
@@ -179,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
}
-static void
-hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
-{
- ip6_netmask(ip, prefix);
-}
-
static bool
hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
@@ -221,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -EINVAL;
@@ -260,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -IPSET_ERR_HASH_ELEM;
@@ -277,14 +279,17 @@ static struct ip_set_type hash_ip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index eceb7bc4a93a..467c59a83c0a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -23,7 +23,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
@@ -268,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index aba1df617d6e..a22ec1a6f6ec 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -21,7 +21,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */
+/* 2 skbinfo support */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
@@ -96,11 +97,11 @@ static int
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark4 *h = set->data;
+ struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0;
+ u32 ip, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -120,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
+ if (e.mark == 0 && e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
@@ -132,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (e.mark == 0 && ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -144,8 +150,12 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
- for (; ip <= ip_to; ip++) {
+ for (; ip <= ip_to; ip++, i++) {
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipmark4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -274,12 +284,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmark_create,
.create_policy = {
[IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 1ff228717e29..e977b5a9c48d 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -25,7 +25,9 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+/* 6 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -34,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port");
/* Type specific function prefix */
#define HTYPE hash_ipport
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -91,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -104,11 +112,11 @@ static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport4 *h = set->data;
+ struct hash_ipport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0, p = 0, port, port_to;
+ u32 ip, ip_to = 0, p = 0, port, port_to, i = 0;
bool with_ports = false;
int ret;
@@ -128,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -177,9 +189,13 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; ip <= ip_to; ip++) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.ip = htonl(ip);
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipport4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -249,12 +265,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -294,6 +315,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -341,15 +366,19 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index fa88afd812fa..39a01934b153 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -25,7 +25,8 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -107,11 +108,11 @@ static int
hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip4 *h = set->data;
+ struct hash_ipportip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0, p = 0, port, port_to;
+ u32 ip, ip_to = 0, p = 0, port, port_to, i = 0;
bool with_ports = false;
int ret;
@@ -184,9 +185,13 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; ip <= ip_to; ip++) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.ip = htonl(ip);
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipportip4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -356,11 +361,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index eef6ecfcb409..5c6de605a9fb 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -27,7 +27,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -159,12 +160,12 @@ static int
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet4 *h = set->data;
+ struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, p = 0, port, port_to;
- u32 ip2_from = 0, ip2_to = 0, ip2;
+ u32 ip2_from = 0, ip2_to = 0, ip2, i = 0;
bool with_ports = false;
u8 cidr;
int ret;
@@ -278,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; p <= port_to; p++) {
e.port = htons(p);
do {
+ i++;
e.ip2 = htonl(ip2);
ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr);
e.cidr = cidr - 1;
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipportnet4_data_next(&h->next,
+ &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -513,11 +520,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 0b61593165ef..718814730acf 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -16,7 +16,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -125,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_mac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 136cf0781d3a..ce0a9ce5a91f 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -24,7 +24,8 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -135,11 +136,11 @@ static int
hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_net4 *h = set->data;
+ struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = { .cidr = HOST_MASK };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -187,10 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
}
+
if (retried)
ip = ntohl(h->next.ip);
do {
+ i++;
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_net4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -354,11 +361,13 @@ static struct ip_set_type hash_net_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_net_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index be5e95a0d876..30a655e5c4fd 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -26,7 +26,8 @@
/* 4 Comments support added */
/* 5 Forceadd support added */
/* 6 skbinfo support added */
-#define IPSET_TYPE_REV_MAX 7 /* interface wildcard support added */
+/* 7 interface wildcard support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -39,7 +40,7 @@ MODULE_ALIAS("ip_set_hash:net,iface");
#define IP_SET_HASH_WITH_MULTI
#define IP_SET_HASH_WITH_NET0
-#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ)
+#define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ)
/* IPv4 variant */
@@ -137,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
#include "ip_set_hash_gen.h"
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static const char *get_physindev_name(const struct sk_buff *skb)
+static const char *get_physindev_name(const struct sk_buff *skb, struct net *net)
{
- struct net_device *dev = nf_bridge_get_physindev(skb);
+ struct net_device *dev = nf_bridge_get_physindev(skb, net);
return dev ? dev->name : NULL;
}
@@ -176,16 +177,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
@@ -201,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -225,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -259,7 +260,12 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
do {
+ i++;
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netiface4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
ret = adtfn(set, &e, &ext, &ext, flags);
@@ -389,16 +395,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
@@ -442,7 +448,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip, e.cidr);
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -470,11 +476,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netiface_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index da4ef910b12d..8fbe649c9dd3 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -22,7 +22,9 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+/* 3 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -32,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net");
/* Type specific function prefix */
#define HTYPE hash_netnet
#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
#define IPSET_NET_COUNT 2
/* IPv4 variants */
@@ -152,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
- e.ip[0] &= ip_set_netmask(e.cidr[0]);
- e.ip[1] &= ip_set_netmask(e.cidr[1]);
+ e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip);
+ e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -162,12 +166,12 @@ static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netnet4 *h = set->data;
+ struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0;
- u32 ip2 = 0, ip2_from = 0, ip2_to = 0;
+ u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -211,8 +215,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_IP2_TO])) {
- e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
- e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1]));
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_enomatch(ret, flags, adt, set) ? -ret :
ip_set_eexist(ret, flags) ? 0 : ret;
@@ -255,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
e.ip[0] = htonl(ip);
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
do {
+ i++;
e.ip[1] = htonl(ip2);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netnet4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -389,6 +398,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -399,6 +413,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ const struct hash_netnet6 *h = set->data;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -438,6 +453,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -IPSET_ERR_HASH_ELEM;
+
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -459,14 +479,18 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 34448df80fb9..d1a0628df4ef 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -26,7 +26,8 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -153,11 +154,11 @@ static int
hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport4 *h = set->data;
+ struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 port, port_to, p = 0, ip = 0, ip_to = 0;
+ u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0;
bool with_ports = false;
u8 cidr;
int ret;
@@ -245,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
e.ip = htonl(ip);
ip = ip_set_range_to_cidr(ip, ip_to, &cidr);
e.cidr = cidr - 1;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netport4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -460,11 +465,13 @@ static struct ip_set_type hash_netport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 934c1712cba8..bf4f91b78e1d 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -23,7 +23,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 0 Comments support added */
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -35,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net");
#define IP_SET_HASH_WITH_PROTO
#define IP_SET_HASH_WITH_NETS
#define IPSET_NET_COUNT 2
+#define IP_SET_HASH_WITH_NET0
/* IPv4 variant */
@@ -172,16 +174,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
+static u32
+hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+ if (from == 0 && to == UINT_MAX) {
+ *cidr = 0;
+ return to;
+ }
+ return ip_set_range_to_cidr(from, to, cidr);
+}
+
static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet4 *h = set->data;
+ struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, p = 0, port, port_to;
- u32 ip2_from = 0, ip2_to = 0, ip2;
+ u32 ip2_from = 0, ip2_to = 0, ip2, i = 0;
bool with_ports = false;
int ret;
@@ -295,13 +307,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
do {
e.ip[0] = htonl(ip);
- ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
+ ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]);
for (; p <= port_to; p++) {
e.port = htons(p);
do {
+ i++;
e.ip[1] = htonl(ip2);
- ip2 = ip_set_range_to_cidr(ip2, ip2_to,
- &e.cidr[1]);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netportnet4_data_next(&h->next,
+ &e);
+ return -ERANGE;
+ }
+ ip2 = hash_netportnet4_range_to_cidr(ip2,
+ ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -558,11 +576,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 5a67f7966574..13c7a08aa868 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -79,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -99,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -188,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct list_set *map = set->data;
struct set_adt_elem *d = value;
struct set_elem *e, *next, *prev = NULL;
- int ret;
+ int ret = 0;
- list_for_each_entry(e, &map->members, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -201,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (d->before == 0) {
ret = 1;
+ goto out;
} else if (d->before > 0) {
next = list_next_entry(e, list);
ret = !list_is_last(&e->list, &map->members) &&
@@ -208,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
} else {
ret = prev && prev->id == d->refid;
}
- return ret;
+ goto out;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static void
@@ -239,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
/* Find where to add the new entry */
n = prev = next = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -316,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e, *next, *prev = NULL;
+ struct set_elem *e, *n, *next, *prev = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_safe(e, n, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -424,17 +428,8 @@ static void
list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e, *n;
-
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
- list_for_each_entry_safe(e, n, &map->members, list) {
- list_del(&e->list);
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- kfree(e);
- }
+ WARN_ON_ONCE(!list_empty(&map->members));
kfree(map);
set->data = NULL;
@@ -545,6 +540,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_shutdown_sync(&map->gc);
+
+ /* Flush list to drop references to other ipsets */
+ list_set_flush(set);
+}
+
static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
@@ -558,12 +565,13 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head,
.list = list_set_list,
.same_set = list_set_same_set,
+ .cancel_gc = list_set_cancel_gc,
};
static void
list_set_gc(struct timer_list *t)
{
- struct list_set *map = from_timer(map, t, gc);
+ struct list_set *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
spin_lock_bh(&set->lock);
@@ -603,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size)
return true;
}
+static struct lock_class_key list_set_lockdep_key;
+
static int
list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
u32 flags)
@@ -619,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (size < IP_SET_LIST_MIN_SIZE)
size = IP_SET_LIST_MIN_SIZE;
+ lockdep_set_class(&set->lock, &list_set_lockdep_key);
set->variant = &set_variant;
set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
__alignof__(struct set_elem));
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 2c1593089ede..c203252e856d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -4,7 +4,7 @@
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
depends on (NF_CONNTRACK || NF_CONNTRACK=n)
help
IP Virtual Server support will let you build a high-performance
@@ -29,7 +29,6 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
- select IP6_NF_IPTABLES
select NF_DEFRAG_IPV6
help
Add IPv6 support to IPVS.
@@ -45,7 +44,8 @@ config IP_VS_DEBUG
config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
- range 8 20
+ range 8 20 if !64BIT
+ range 8 27 if 64BIT
default 12
help
The IPVS connection hash table uses the chaining scheme to handle
@@ -55,24 +55,24 @@ config IP_VS_TAB_BITS
Note the table size must be power of 2. The table size will be the
value of 2 to the your input number power. The number to choose is
- from 8 to 20, the default number is 12, which means the table size
- is 4096. Don't input the number too small, otherwise you will lose
- performance on it. You can adapt the table size yourself, according
- to your virtual server application. It is good to set the table size
- not far less than the number of connections per second multiplying
- average lasting time of connection in the table. For example, your
- virtual server gets 200 connections per second, the connection lasts
- for 200 seconds in average in the connection table, the table size
- should be not far less than 200x200, it is good to set the table
- size 32768 (2**15).
+ from 8 to 27 for 64BIT(20 otherwise), the default number is 12,
+ which means the table size is 4096. Don't input the number too
+ small, otherwise you will lose performance on it. You can adapt the
+ table size yourself, according to your virtual server application.
+ It is good to set the table size not far less than the number of
+ connections per second multiplying average lasting time of
+ connection in the table. For example, your virtual server gets 200
+ connections per second, the connection lasts for 200 seconds in
+ average in the connection table, the table size should be not far
+ less than 200x200, it is good to set the table size 32768 (2**15).
Another note that each connection occupies 128 bytes effectively and
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
You can overwrite this number setting conn_tab_bits module parameter
- or by appending ip_vs.conn_tab_bits=? to the kernel command line
- if IP VS was compiled built-in.
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line if
+ IP VS was compiled built-in.
comment "IPVS transport protocol load balancing support"
@@ -105,7 +105,7 @@ config IP_VS_PROTO_AH
config IP_VS_PROTO_SCTP
bool "SCTP load balancing support"
- select LIBCRC32C
+ select NET_CRC32C
help
This option enables support for load balancing SCTP transport
protocol. Say Y if unsure.
@@ -272,6 +272,17 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_TWOS
+ tristate "weighted random twos choice least-connection scheduling"
+ help
+ The weighted random twos choice least-connection scheduling
+ algorithm picks two random real servers and directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
comment 'IPVS SH scheduler'
config IP_VS_SH_TAB_BITS
@@ -308,7 +319,7 @@ config IP_VS_MH_TAB_INDEX
comment 'IPVS application helper'
config IP_VS_FTP
- tristate "FTP protocol helper"
+ tristate "FTP protocol helper"
depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
NF_CONNTRACK_FTP
select IP_VS_NFCT
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index bfce2677fda2..bb5d8125c82a 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index f9b16f2b2219..d54d7da58334 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -13,8 +13,7 @@
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -599,13 +598,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 02f2f636798d..50cc492c7553 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -17,8 +17,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/interrupt.h>
#include <linux/in.h>
@@ -26,12 +25,12 @@
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/rcupdate_wait.h>
#include <net/net_namespace.h>
#include <net/ip_vs.h>
@@ -402,6 +401,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
+ const union nf_inet_addr *saddr;
+ __be16 sport;
/*
* Check for "full" addressed entries
@@ -411,10 +412,20 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->vport == cp->cport && p->cport == cp->dport &&
- cp->af == p->af &&
+ if (p->vport != cp->cport)
+ continue;
+
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ sport = cp->vport;
+ saddr = &cp->vaddr;
+ } else {
+ sport = cp->dport;
+ saddr = &cp->daddr;
+ }
+
+ if (p->cport == sport && cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
- ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, saddr) &&
p->protocol == cp->protocol &&
cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
@@ -807,9 +818,34 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
kmem_cache_free(ip_vs_conn_cachep, cp);
}
+/* Try to delete connection while not holding reference */
+static void ip_vs_conn_del(struct ip_vs_conn *cp)
+{
+ if (timer_delete(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ ip_vs_conn_expire(&cp->timer);
+ }
+}
+
+/* Try to delete connection while holding reference */
+static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
+{
+ if (timer_delete(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire(&cp->timer);
+ } else {
+ __ip_vs_conn_put(cp);
+ }
+}
+
static void ip_vs_conn_expire(struct timer_list *t)
{
- struct ip_vs_conn *cp = from_timer(cp, t, timer);
+ struct ip_vs_conn *cp = timer_container_of(cp, t, timer);
struct netns_ipvs *ipvs = cp->ipvs;
/*
@@ -823,18 +859,21 @@ static void ip_vs_conn_expire(struct timer_list *t)
struct ip_vs_conn *ct = cp->control;
/* delete the timer if it is activated by other users */
- del_timer(&cp->timer);
+ timer_delete(&cp->timer);
/* does anybody control me? */
if (ct) {
+ bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);
+
ip_vs_control_del(cp);
/* Drop CTL or non-assured TPL if not used anymore */
- if (!cp->timeout && !atomic_read(&ct->n_control) &&
+ if (has_ref && !atomic_read(&ct->n_control) &&
(!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
!(ct->state & IP_VS_CTPL_S_ASSURED))) {
IP_VS_DBG(4, "drop controlling connection\n");
- ct->timeout = 0;
- ip_vs_conn_expire_now(ct);
+ ip_vs_conn_del_put(ct);
+ } else if (has_ref) {
+ __ip_vs_conn_put(ct);
}
}
@@ -845,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
* conntrack cleanup for the net.
*/
smp_rmb();
- if (ipvs->enable)
+ if (READ_ONCE(ipvs->enable))
ip_vs_conn_drop_conntrack(cp);
}
@@ -886,7 +925,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
/* Using mod_timer_pending will ensure the timer is not
- * modified after the final del_timer in ip_vs_conn_expire.
+ * modified after the final timer_delete in ip_vs_conn_expire.
*/
if (timer_pending(&cp->timer) &&
time_after(cp->timer.expires, jiffies))
@@ -1006,28 +1045,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
#ifdef CONFIG_PROC_FS
struct ip_vs_iter_state {
struct seq_net_private p;
- struct hlist_head *l;
+ unsigned int bucket;
+ unsigned int skip_elems;
};
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+static void *ip_vs_conn_array(struct ip_vs_iter_state *iter)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_iter_state *iter = seq->private;
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) {
+ unsigned int skip = 0;
+
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
/* __ip_vs_conn_get() is not needed by
* ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
*/
- if (pos-- == 0) {
- iter->l = &ip_vs_conn_tab[idx];
+ if (skip >= iter->skip_elems) {
+ iter->bucket = idx;
return cp;
}
+
+ ++skip;
}
+
+ iter->skip_elems = 0;
cond_resched_rcu();
}
+ iter->bucket = idx;
return NULL;
}
@@ -1036,9 +1082,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip_vs_iter_state *iter = seq->private;
- iter->l = NULL;
rcu_read_lock();
- return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+ if (*pos == 0) {
+ iter->skip_elems = 0;
+ iter->bucket = 0;
+ return SEQ_START_TOKEN;
+ }
+
+ return ip_vs_conn_array(iter);
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -1046,28 +1097,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
struct hlist_node *e;
- struct hlist_head *l = iter->l;
- int idx;
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_conn_array(seq, 0);
+ return ip_vs_conn_array(iter);
/* more on same hash chain? */
e = rcu_dereference(hlist_next_rcu(&cp->c_list));
- if (e)
+ if (e) {
+ iter->skip_elems++;
return hlist_entry(e, struct ip_vs_conn, c_list);
-
- idx = l - ip_vs_conn_tab;
- while (++idx < ip_vs_conn_tab_size) {
- hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- iter->l = &ip_vs_conn_tab[idx];
- return cp;
- }
- cond_resched_rcu();
}
- iter->l = NULL;
- return NULL;
+
+ iter->skip_elems = 0;
+ iter->bucket++;
+
+ return ip_vs_conn_array(iter);
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
@@ -1225,8 +1270,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1268,7 +1313,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+ unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
@@ -1317,8 +1362,7 @@ try_drop:
drop:
IP_VS_DBG(4, "drop connection\n");
- cp->timeout = 0;
- ip_vs_conn_expire_now(cp);
+ ip_vs_conn_del(cp);
}
cond_resched_rcu();
}
@@ -1341,19 +1385,15 @@ flush_again:
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (cp->ipvs != ipvs)
continue;
- /* As timers are expired in LIFO order, restart
- * the timer of controlling connection first, so
- * that it is expired after us.
- */
+ if (atomic_read(&cp->n_control))
+ continue;
cp_c = cp->control;
- /* cp->control is valid only with reference to cp */
- if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
IP_VS_DBG(4, "del controlling connection\n");
- ip_vs_conn_expire_now(cp_c);
- __ip_vs_conn_put(cp);
+ ip_vs_conn_del(cp_c);
}
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
}
cond_resched_rcu();
}
@@ -1366,6 +1406,45 @@ flush_again:
goto flush_again;
}
}
+
+#ifdef CONFIG_SYSCTL
+void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (cp->ipvs != ipvs)
+ continue;
+
+ dest = cp->dest;
+ if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
+ continue;
+
+ if (atomic_read(&cp->n_control))
+ continue;
+
+ cp_c = cp->control;
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
+ IP_VS_DBG(4, "del controlling connection\n");
+ ip_vs_conn_del(cp_c);
+ }
+ }
+ cond_resched_rcu();
+
+ /* netns clean up started, abort delayed work */
+ if (!READ_ONCE(ipvs->enable))
+ break;
+ }
+ rcu_read_unlock();
+}
+#endif
+
/*
* per netns init and exit
*/
@@ -1373,51 +1452,78 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
{
+ size_t tab_array_size;
+ int max_avail;
+#if BITS_PER_LONG > 32
+ int max = 27;
+#else
+ int max = 20;
+#endif
+ int min = 8;
int idx;
- /* Compute size and mask */
+ max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
+ max_avail -= 2; /* ~4 in hash row */
+ max_avail -= 1; /* IPVS up to 1/2 of mem */
+ max_avail -= order_base_2(sizeof(struct ip_vs_conn));
+ max = clamp(max_avail, min, max);
+ ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max);
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size,
- sizeof(*ip_vs_conn_tab)));
+ tab_array_size = array_size(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab));
+ ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab), GFP_KERNEL);
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
- ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
- sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
if (!ip_vs_conn_cachep) {
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
return -ENOMEM;
}
- pr_info("Connection hash table configured "
- "(size=%d, memory=%ldKbytes)\n",
- ip_vs_conn_tab_size,
- (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+ pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n",
+ ip_vs_conn_tab_size, tab_array_size / 1024);
IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
@@ -1440,5 +1546,5 @@ void ip_vs_conn_cleanup(void)
rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index aa6a603a2425..90d56f92c0f6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -19,8 +19,7 @@
* Harald Welte don't use nfcache
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -68,18 +67,6 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
-#ifdef CONFIG_IP_VS_PROTO_TCP
-INDIRECT_CALLABLE_DECLARE(int
- tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
-#endif
-
-#ifdef CONFIG_IP_VS_PROTO_UDP
-INDIRECT_CALLABLE_DECLARE(int
- udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
- struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
-#endif
-
#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
#define SNAT_CALL(f, ...) \
INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
@@ -144,21 +131,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -180,21 +167,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -212,17 +199,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
s = this_cpu_ptr(cp->dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -694,16 +681,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
return ipvs->sysctl_nat_icmp_send;
}
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
-{
- return ipvs->sysctl_expire_nodest_conn;
-}
-
#else
static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
#endif
@@ -748,12 +729,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(ipvs->net, skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -881,7 +862,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ goto after_nat;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -907,6 +888,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
+after_nat:
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -915,8 +897,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
-
-ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -1159,7 +1139,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
__be16 vport;
unsigned int flags;
- EnterFunction(12);
vaddr = &svc->addr;
vport = svc->port;
daddr = &iph->saddr;
@@ -1227,7 +1206,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
cp->flags, refcount_read(&cp->refcnt));
- LeaveFunction(12);
return cp;
}
@@ -1282,6 +1260,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto after_nat;
+
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
if (skb_ensure_writable(skb, iph->len))
@@ -1322,6 +1303,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
+after_nat:
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
@@ -1331,13 +1313,11 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
- LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
- LeaveFunction(11);
return NF_STOLEN;
}
@@ -1345,16 +1325,17 @@ drop:
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ int af = state->pf;
struct sock *sk;
- EnterFunction(11);
-
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
@@ -1364,16 +1345,13 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- if (!ipvs->enable)
- return NF_ACCEPT;
-
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1418,11 +1396,8 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
ipvs, af, skb, &iph);
- if (likely(cp)) {
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1481,66 +1456,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
-out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
-
-ignore_cp:
- __ip_vs_conn_put(cp);
- goto out;
}
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-#endif
-
static unsigned int
ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
@@ -1626,6 +1546,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
if (!dest)
goto unk;
if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
__be16 type;
/* Only support version 0 and C (csum) */
@@ -1636,7 +1557,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
if (type != htons(ETH_P_IP))
goto unk;
*proto = IPPROTO_IPIP;
- return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
+
+ gre_flags_to_tnl_flags(flags, greh->flags);
+
+ return gre_calc_hlen(flags);
}
unk:
@@ -1980,15 +1904,17 @@ out:
* and send it on its way...
*/
static unsigned int
-ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
- int conn_reuse_mode;
struct sock *sk;
+ int af = state->pf;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -2010,7 +1936,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, false, &iph);
@@ -2020,7 +1946,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
@@ -2064,16 +1990,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, af, skb, &iph);
- conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
- if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
- bool uses_ct = false, resched = false;
+ if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+ int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+ bool old_ct = false, resched = false;
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) {
resched = true;
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
- } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
+ } else if (conn_reuse_mode &&
+ is_new_conn_expected(cp, conn_reuse_mode)) {
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
if (!atomic_read(&cp->n_control)) {
resched = true;
} else {
@@ -2081,50 +2008,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
* that uses conntrack while it is still
* referenced by controlled connection(s).
*/
- resched = !uses_ct;
+ resched = !old_ct;
}
}
if (resched) {
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
if (!atomic_read(&cp->n_control))
ip_vs_conn_expire_now(cp);
__ip_vs_conn_put(cp);
- if (uses_ct)
+ if (old_ct)
return NF_DROP;
cp = NULL;
}
}
- if (unlikely(!cp)) {
- int v;
-
- if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
- return v;
- }
-
- IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
-
/* Check the server status */
- if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
- __u32 flags = cp->flags;
-
- /* when timer already started, silently drop the packet.*/
- if (timer_pending(&cp->timer))
- __ip_vs_conn_put(cp);
- else
- ip_vs_conn_put(cp);
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
- if (sysctl_expire_nodest_conn(ipvs) &&
- !(flags & IP_VS_CONN_F_ONE_PACKET)) {
- /* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ if (old_ct)
+ return NF_DROP;
+ cp = NULL;
+ } else {
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
}
+ }
- return NF_DROP;
+ if (unlikely(!cp)) {
+ int v;
+
+ if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
+ return v;
}
+ IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
+
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
@@ -2147,7 +2075,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
pkts = sysctl_sync_threshold(ipvs);
else
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -2160,55 +2088,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
}
/*
- * AF_INET handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-/*
- * AF_INET handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-#endif
-
-
-/*
* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
@@ -2221,45 +2100,36 @@ static unsigned int
ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- int r;
struct netns_ipvs *ipvs = net_ipvs(state->net);
-
- if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
- return NF_ACCEPT;
+ int r;
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
- return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
-}
-
+ if (state->pf == NFPROTO_IPV4) {
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
#ifdef CONFIG_IP_VS_IPV6
-static unsigned int
-ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- int r;
- struct netns_ipvs *ipvs = net_ipvs(state->net);
- struct ip_vs_iphdr iphdr;
+ } else {
+ struct ip_vs_iphdr iphdr;
- ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- if (iphdr.protocol != IPPROTO_ICMPV6)
- return NF_ACCEPT;
+ ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- /* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
- return NF_ACCEPT;
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
- return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
-}
+ return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
#endif
+ }
+ return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
+}
-static const struct nf_hook_ops ip_vs_ops[] = {
+static const struct nf_hook_ops ip_vs_ops4[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
@@ -2268,21 +2138,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
@@ -2297,15 +2167,18 @@ static const struct nf_hook_ops ip_vs_ops[] = {
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
+};
+
#ifdef CONFIG_IP_VS_IPV6
+static const struct nf_hook_ops ip_vs_ops6[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 2,
@@ -2314,21 +2187,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 2,
@@ -2336,20 +2209,76 @@ static const struct nf_hook_ops ip_vs_ops[] = {
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
- .hook = ip_vs_forward_icmp_v6,
+ .hook = ip_vs_forward_icmp,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
-#endif
};
+#endif
+
+int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+ int ret = 0;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return -EINVAL;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (!(ipvs->hooks_afmask & afmask)) {
+ ret = nf_register_net_hooks(ipvs->net, ops, count);
+ if (ret >= 0)
+ ipvs->hooks_afmask |= afmask;
+ }
+ return ret;
+}
+
+void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (ipvs->hooks_afmask & afmask) {
+ nf_unregister_net_hooks(ipvs->net, ops, count);
+ ipvs->hooks_afmask &= ~afmask;
+ }
+}
+
/*
* Initialize IP Virtual Server netns mem.
*/
@@ -2361,8 +2290,8 @@ static int __net_init __ip_vs_init(struct net *net)
if (ipvs == NULL)
return -ENOMEM;
- /* Hold the beast until a service is registerd */
- ipvs->enable = 0;
+ /* Hold the beast until a service is registered */
+ WRITE_ONCE(ipvs->enable, 0);
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
@@ -2425,33 +2354,19 @@ static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
}
}
-static int __net_init __ip_vs_dev_init(struct net *net)
-{
- int ret;
-
- ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- if (ret < 0)
- goto hook_fail;
- return 0;
-
-hook_fail:
- return ret;
-}
-
static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
{
struct netns_ipvs *ipvs;
struct net *net;
- EnterFunction(2);
list_for_each_entry(net, net_list, exit_list) {
ipvs = net_ipvs(net);
- nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- ipvs->enable = 0; /* Disable packet reception */
+ ip_vs_unregister_hooks(ipvs, AF_INET);
+ ip_vs_unregister_hooks(ipvs, AF_INET6);
+ WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */
smp_wmb();
ip_vs_sync_net_cleanup(ipvs);
}
- LeaveFunction(2);
}
static struct pernet_operations ipvs_core_ops = {
@@ -2462,7 +2377,6 @@ static struct pernet_operations ipvs_core_ops = {
};
static struct pernet_operations ipvs_core_dev_ops = {
- .init = __ip_vs_dev_init,
.exit_batch = __ip_vs_dev_cleanup_batch,
};
@@ -2526,9 +2440,14 @@ static void __exit ip_vs_cleanup(void)
ip_vs_conn_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
+ /* common rcu_barrier() used by:
+ * - ip_vs_control_cleanup()
+ */
+ rcu_barrier();
pr_info("ipvs unloaded.\n");
}
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Virtual Server");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 412656c34f20..068702894377 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -13,8 +13,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/init.h>
@@ -24,7 +23,6 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
-#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -48,8 +46,9 @@
#include <net/ip_vs.h>
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
+MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
+
+DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */
/* sysctl variables */
@@ -94,6 +93,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
int availmem;
+ int amemthresh;
int nomem;
int to_change = -1;
@@ -105,7 +105,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < ipvs->sysctl_amemthresh);
+ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
+ nomem = (availmem < amemthresh);
local_bh_disable();
@@ -145,9 +146,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 1:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
ipvs->sysctl_drop_packet = 2;
} else {
ipvs->drop_rate = 0;
@@ -155,9 +155,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 2:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
} else {
ipvs->drop_rate = 0;
ipvs->sysctl_drop_packet = 1;
@@ -210,6 +209,17 @@ static void update_defense_level(struct netns_ipvs *ipvs)
local_bh_enable();
}
+/* Handler for delayed work for expiring no
+ * destination connections
+ */
+static void expire_nodest_conn_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = container_of(work, struct netns_ipvs,
+ expire_nodest_conn_work.work);
+ ip_vs_expire_nodest_conn_flush(ipvs);
+}
/*
* Timer for checking the defense
@@ -224,10 +234,52 @@ static void defense_work_handler(struct work_struct *work)
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
ip_vs_random_dropentry(ipvs);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
}
#endif
+static void est_reload_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, est_reload_work.work);
+ int genid_done = atomic_read(&ipvs->est_genid_done);
+ unsigned long delay = HZ / 10; /* repeat startups after failure */
+ bool repeat = false;
+ int genid;
+ int id;
+
+ mutex_lock(&ipvs->est_mutex);
+ genid = atomic_read(&ipvs->est_genid);
+ for (id = 0; id < ipvs->est_kt_count; id++) {
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
+
+ /* netns clean up started, abort delayed work */
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock;
+ if (!kd)
+ continue;
+ /* New config ? Stop kthread tasks */
+ if (genid != genid_done)
+ ip_vs_est_kthread_stop(kd);
+ if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ /* Do not start kthreads above 0 in calc phase */
+ if ((!id || !ipvs->est_calc_phase) &&
+ ip_vs_est_kthread_start(ipvs, kd) < 0)
+ repeat = true;
+ }
+ }
+
+ atomic_set(&ipvs->est_genid_done, genid);
+
+ if (repeat)
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
+ delay);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+}
+
int
ip_vs_use_count_inc(void)
{
@@ -458,7 +510,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static void ip_vs_service_free(struct ip_vs_service *svc)
{
- free_percpu(svc->stats.cpustats);
+ ip_vs_stats_release(&svc->stats);
kfree(svc);
}
@@ -470,17 +522,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head)
ip_vs_service_free(svc);
}
-static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
- if (do_delay)
- call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
- else
- ip_vs_service_free(svc);
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
}
@@ -767,14 +816,22 @@ out:
return dest;
}
+static void ip_vs_dest_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest *dest;
+
+ dest = container_of(head, struct ip_vs_dest, rcu_head);
+ ip_vs_stats_release(&dest->stats);
+ ip_vs_dest_put_and_free(dest);
+}
+
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
__ip_vs_dst_cache_reset(dest);
- __ip_vs_svc_put(svc, false);
- free_percpu(dest->stats.cpustats);
- ip_vs_dest_put_and_free(dest);
+ __ip_vs_svc_put(svc);
+ call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}
/*
@@ -790,7 +847,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
- del_timer_sync(&ipvs->dest_trash_timer);
+ timer_delete_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
list_del(&dest->t_list);
@@ -798,12 +855,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
}
}
+static void ip_vs_stats_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_stats_rcu *rs = container_of(head,
+ struct ip_vs_stats_rcu,
+ rcu_head);
+
+ ip_vs_stats_release(&rs->s);
+ kfree(rs);
+}
+
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
- spin_lock_bh(&src->lock);
+ spin_lock(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -813,7 +880,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
ip_vs_read_estimator(dst, src);
- spin_unlock_bh(&src->lock);
+ spin_unlock(&src->lock);
}
static void
@@ -834,7 +901,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
- spin_lock_bh(&stats->lock);
+ spin_lock(&stats->lock);
/* get current counters as zero point, rates are zeroed */
@@ -848,7 +915,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
ip_vs_zero_estimator(stats);
- spin_unlock_bh(&stats->lock);
+ spin_unlock(&stats->lock);
+}
+
+/* Allocate fields after kzalloc */
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
+{
+ int i;
+
+ spin_lock_init(&s->lock);
+ s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!s->cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
+
+ u64_stats_init(&cs->syncp);
+ }
+ return 0;
+}
+
+struct ip_vs_stats *ip_vs_stats_alloc(void)
+{
+ struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s && ip_vs_stats_init_alloc(s) >= 0)
+ return s;
+ kfree(s);
+ return NULL;
+}
+
+void ip_vs_stats_release(struct ip_vs_stats *stats)
+{
+ free_percpu(stats->cpustats);
+}
+
+void ip_vs_stats_free(struct ip_vs_stats *stats)
+{
+ if (stats) {
+ ip_vs_stats_release(stats);
+ kfree(stats);
+ }
}
/*
@@ -910,7 +1018,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
- __ip_vs_svc_put(old_svc, true);
+ __ip_vs_svc_put(old_svc);
}
}
@@ -929,7 +1037,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock_bh(&dest->dst_lock);
if (add) {
- ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -947,18 +1054,14 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Create a destination for the given service
*/
static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
- struct ip_vs_dest **dest_p)
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
- unsigned int atype, i;
-
- EnterFunction(2);
+ unsigned int atype;
+ int ret;
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
- int ret;
-
atype = ipv6_addr_type(&udest->addr.in6);
if ((!(atype & IPV6_ADDR_UNICAST) ||
atype & IPV6_ADDR_LINKLOCAL) &&
@@ -980,15 +1083,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
if (dest == NULL)
return -ENOMEM;
- dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!dest->stats.cpustats)
+ ret = ip_vs_stats_init_alloc(&dest->stats);
+ if (ret < 0)
goto err_alloc;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_dest_stats;
- ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
- u64_stats_init(&ip_vs_dest_stats->syncp);
- }
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err_stats;
dest->af = udest->af;
dest->protocol = svc->protocol;
@@ -1005,17 +1106,16 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
- spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
- *dest_p = dest;
-
- LeaveFunction(2);
return 0;
+err_stats:
+ ip_vs_stats_release(&dest->stats);
+
err_alloc:
kfree(dest);
- return -ENOMEM;
+ return ret;
}
@@ -1030,8 +1130,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
__be16 dport = udest->port;
int ret;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -1077,15 +1175,16 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ return ret;
__ip_vs_update_dest(svc, dest, udest, 1);
- ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
- ret = ip_vs_new_dest(svc, udest, &dest);
+ ret = ip_vs_new_dest(svc, udest);
}
- LeaveFunction(2);
return ret;
}
@@ -1101,8 +1200,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
union nf_inet_addr daddr;
__be16 dport = udest->port;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -1134,7 +1231,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
}
__ip_vs_update_dest(svc, dest, udest, 0);
- LeaveFunction(2);
return 0;
}
@@ -1163,6 +1259,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
list_add(&dest->t_list, &ipvs->dest_trash);
dest->idle_start = 0;
spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ /* Queue up delayed work to expire all no destination connections.
+ * No-op when CONFIG_SYSCTL is disabled.
+ */
+ if (!cleanup)
+ ip_vs_enqueue_expire_nodest_conns(ipvs);
}
@@ -1203,8 +1305,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
struct ip_vs_dest *dest;
__be16 dport = udest->port;
- EnterFunction(2);
-
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
@@ -1225,14 +1325,13 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
*/
__ip_vs_del_dest(svc->ipvs, dest, false);
- LeaveFunction(2);
-
return 0;
}
static void ip_vs_dest_trash_expire(struct timer_list *t)
{
- struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer);
+ struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
+ dest_trash_timer);
struct ip_vs_dest *dest, *next;
unsigned long now = jiffies;
@@ -1268,10 +1367,11 @@ static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0, i;
+ int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ int ret_hooks = -1;
/* increase the module use count */
if (!ip_vs_use_count_inc())
@@ -1313,24 +1413,23 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
}
#endif
+ if ((u->af == AF_INET && !ipvs->num_services) ||
+ (u->af == AF_INET6 && !ipvs->num_services6)) {
+ ret = ip_vs_register_hooks(ipvs, u->af);
+ if (ret < 0)
+ goto out_err;
+ ret_hooks = ret;
+ }
+
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
- svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!svc->stats.cpustats) {
- ret = -ENOMEM;
+ ret = ip_vs_stats_init_alloc(&svc->stats);
+ if (ret < 0)
goto out_err;
- }
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_stats;
- ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
- u64_stats_init(&ip_vs_stats->syncp);
- }
-
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
@@ -1340,14 +1439,13 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
- svc->flags = u->flags;
+ svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->ipvs = ipvs;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
@@ -1357,34 +1455,47 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
sched = NULL;
}
- /* Bind the ct retriever */
- RCU_INIT_POINTER(svc->pe, pe);
- pe = NULL;
+ ret = ip_vs_start_estimator(ipvs, &svc->stats);
+ if (ret < 0)
+ goto out_err;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
- if (svc->pe && svc->pe->conn_out)
+ if (pe && pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
- ip_vs_start_estimator(ipvs, &svc->stats);
+ /* Bind the ct retriever */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
+ else if (svc->af == AF_INET6)
+ ipvs->num_services6++;
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
- /* Now there is a service - full throttle */
- ipvs->enable = 1;
+
+ if (!READ_ONCE(ipvs->enable)) {
+ /* Now there is a service - full throttle */
+ WRITE_ONCE(ipvs->enable, 1);
+
+ /* Start estimation for first time */
+ ip_vs_est_reload_start(ipvs);
+ }
+
return 0;
out_err:
+ if (ret_hooks >= 0)
+ ip_vs_unregister_hooks(ipvs, u->af);
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
@@ -1500,9 +1611,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = svc->ipvs;
- /* Count only IPv4 services for old get/setsockopt interface */
- if (svc->af == AF_INET)
+ if (svc->af == AF_INET) {
ipvs->num_services--;
+ if (!ipvs->num_services)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ } else if (svc->af == AF_INET6) {
+ ipvs->num_services6--;
+ if (!ipvs->num_services6)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ }
ip_vs_stop_estimator(svc->ipvs, &svc->stats);
@@ -1536,7 +1653,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/*
* Free the service if nobody refers to it
*/
- __ip_vs_svc_put(svc, true);
+ __ip_vs_svc_put(svc);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1614,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
struct netns_ipvs *ipvs;
struct net *net;
- EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
list_for_each_entry(net, net_list, exit_list) {
@@ -1622,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
ip_vs_flush(ipvs, true);
}
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
}
/* Put all references for device (dst_cache) */
@@ -1660,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
- EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
@@ -1689,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
return NOTIFY_DONE;
}
@@ -1726,16 +1839,14 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
}
}
- ip_vs_zero_stats(&ipvs->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats->s);
return 0;
}
#ifdef CONFIG_SYSCTL
-static int three = 3;
-
static int
-proc_do_defense_mode(struct ctl_table *table, int write,
+proc_do_defense_mode(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
@@ -1762,9 +1873,10 @@ proc_do_defense_mode(struct ctl_table *table, int write,
}
static int
-proc_do_sync_threshold(struct ctl_table *table, int write,
+proc_do_sync_threshold(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val[2];
int rc;
@@ -1774,6 +1886,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
.mode = table->mode,
};
+ mutex_lock(&ipvs->sync_mutex);
memcpy(val, valp, sizeof(val));
rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write) {
@@ -1783,11 +1896,12 @@ proc_do_sync_threshold(struct ctl_table *table, int write,
else
memcpy(valp, val, sizeof(val));
}
+ mutex_unlock(&ipvs->sync_mutex);
return rc;
}
static int
-proc_do_sync_ports(struct ctl_table *table, int write,
+proc_do_sync_ports(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
@@ -1810,6 +1924,149 @@ proc_do_sync_ports(struct ctl_table *table, int write,
return rc;
}
+static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
+ void *buffer)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ cpumask_var_t newmask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buffer, newmask);
+ if (ret)
+ goto out;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (!ipvs->est_cpulist_valid) {
+ if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ ipvs->est_cpulist_valid = 1;
+ }
+ cpumask_and(newmask, newmask, &current->cpus_mask);
+ cpumask_copy(*valp, newmask);
+ /* est_max_threads may depend on cpulist size */
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+ ipvs->est_calc_phase = 1;
+ ip_vs_est_reload_start(ipvs);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+
+out:
+ free_cpumask_var(newmask);
+ return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
+ void *buffer, size_t size)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ struct cpumask *mask;
+ int ret;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+ mutex_unlock(&ipvs->est_mutex);
+
+ return ret;
+}
+
+static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ /* Ignore both read and write(append) if *ppos not 0 */
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ /* proc_sys_call_handler() appends terminator */
+ ret = ipvs_proc_est_cpumask_set(table, buffer);
+ if (ret >= 0)
+ *ppos += *lenp;
+ } else {
+ /* proc_sys_call_handler() allocates 1 byte for terminator */
+ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+ if (ret >= 0) {
+ *lenp = ret;
+ *ppos += *lenp;
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ if (val < MIN_NICE || val > MAX_NICE) {
+ ret = -EINVAL;
+ } else {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ return ret;
+}
+
/*
* IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
* Do not change order or insert new entries without
@@ -1942,7 +2199,7 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &three,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "nat_icmp_send",
@@ -1980,6 +2237,24 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "run_estimation",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_run_estimation,
+ },
+ {
+ .procname = "est_cpulist",
+ .maxlen = NR_CPUS, /* unused */
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_cpulist,
+ },
+ {
+ .procname = "est_nice",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_nice,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1989,7 +2264,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec,
},
#endif
- { }
};
#endif
@@ -2216,7 +2490,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_puts(seq,
" Conns Packets Packets Bytes Bytes\n");
- ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)show.conns,
(unsigned long long)show.inpkts,
@@ -2240,7 +2514,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
- struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_kstats kstats;
int i;
@@ -2257,13 +2531,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
- conns = u->cnt.conns;
- inpkts = u->cnt.inpkts;
- outpkts = u->cnt.outpkts;
- inbytes = u->cnt.inbytes;
- outbytes = u->cnt.outbytes;
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ start = u64_stats_fetch_begin(&u->syncp);
+ conns = u64_stats_read(&u->cnt.conns);
+ inpkts = u64_stats_read(&u->cnt.inpkts);
+ outpkts = u64_stats_read(&u->cnt.outpkts);
+ inbytes = u64_stats_read(&u->cnt.inbytes);
+ outbytes = u64_stats_read(&u->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
@@ -2414,7 +2688,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
}
static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
struct net *net = sock_net(sk);
int ret;
@@ -2438,7 +2712,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
return -EINVAL;
}
- if (copy_from_user(arg, user, len) != 0)
+ if (copy_from_sockptr(arg, ptr, len) != 0)
return -EFAULT;
/* Handle daemons since they have another lock */
@@ -2471,6 +2745,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
+ } else if (!len) {
+ /* No more commands with len == 0 below */
+ ret = -EINVAL;
+ goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2549,7 +2827,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = ip_vs_del_dest(svc, &udest);
break;
default:
+ WARN_ON_ONCE(1);
ret = -EINVAL;
+ break;
}
out_unlock:
@@ -2571,7 +2851,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
+ strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2765,13 +3045,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
+ strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
+ strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
d[1].syncid = ipvs->bcfg.syncid;
}
@@ -2811,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_services *)arg;
size = struct_size(get, entrytable, get->num_services);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -2852,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_dests *)arg;
size = struct_size(get, entrytable, get->num_dests);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -3382,10 +3662,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
udest->port = nla_get_be16(nla_port);
- if (nla_addr_family)
- udest->af = nla_get_u16(nla_addr_family);
- else
- udest->af = 0;
+ udest->af = nla_get_u16_default(nla_addr_family, 0);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
@@ -3521,7 +3798,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
sizeof(c.mcast_ifn));
c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
@@ -3855,7 +4132,7 @@ out:
}
-static const struct genl_ops ip_vs_genl_ops[] = {
+static const struct genl_small_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -3963,8 +4240,9 @@ static struct genl_family ip_vs_genl_family __ro_after_init = {
.policy = ip_vs_cmd_policy,
.netnsok = true, /* Make ipvsadm to work on netns */
.module = THIS_MODULE,
- .ops = ip_vs_genl_ops,
- .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .small_ops = ip_vs_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .resv_start_op = IPVS_CMD_FLUSH + 1,
};
static int __init ip_vs_genl_register(void)
@@ -3986,22 +4264,24 @@ static void ip_vs_genl_unregister(void)
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
- int idx;
struct ctl_table *tbl;
+ int idx, ret;
+ size_t ctl_table_size = ARRAY_SIZE(vs_vars);
+ bool unpriv = net->user_ns != &init_user_ns;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+ expire_nodest_conn_handler);
+ ipvs->est_stopped = 0;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
@@ -4027,10 +4307,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+
ipvs->sysctl_sync_sock_size = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
@@ -4039,6 +4326,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx].extra2 = ipvs;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
@@ -4053,29 +4341,66 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
- ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
- if (ipvs->sysctl_hdr == NULL) {
- if (!net_eq(net, &init_net))
- kfree(tbl);
- return -ENOMEM;
- }
- ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
+ ipvs->sysctl_run_estimation = 1;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+ ipvs->est_cpulist_valid = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+ ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_nice;
+
+#ifdef CONFIG_IP_VS_DEBUG
+ /* Global sysctls must be ro in non-init netns */
+ if (!net_eq(net, &init_net))
+ tbl[idx++].mode = 0444;
+#endif
+
+ ret = -ENOMEM;
+ ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
+ ctl_table_size);
+ if (!ipvs->sysctl_hdr)
+ goto err;
ipvs->sysctl_tbl = tbl;
+
+ ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ret < 0)
+ goto err;
+
/* Schedule defense work */
- INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
return 0;
+
+err:
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return ret;
}
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
+ cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+
+ if (ipvs->est_cpulist_valid)
+ free_cpumask_var(ipvs->sysctl_est_cpulist);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -4097,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- int i, idx;
+ int ret = -ENOMEM;
+ int idx;
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -4110,44 +4436,66 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
- /* procfs stats */
- ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!ipvs->tot_stats.cpustats)
- return -ENOMEM;
+ INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ipvs_tot_stats;
- ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
- u64_stats_init(&ipvs_tot_stats->syncp);
- }
-
- spin_lock_init(&ipvs->tot_stats.lock);
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
+ if (!ipvs->tot_stats)
+ goto out;
+ if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
+ goto err_tot_stats;
- proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops,
- sizeof(struct ip_vs_iter));
- proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
- ip_vs_stats_show, NULL);
- proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
- ip_vs_stats_percpu_show, NULL);
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
+ &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
+ goto err_vs;
+ if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
+ ip_vs_stats_show, NULL))
+ goto err_stats;
+ if (!proc_create_net_single("ip_vs_stats_percpu", 0,
+ ipvs->net->proc_net,
+ ip_vs_stats_percpu_show, NULL))
+ goto err_percpu;
+#endif
- if (ip_vs_control_net_init_sysctl(ipvs))
+ ret = ip_vs_control_net_init_sysctl(ipvs);
+ if (ret < 0)
goto err;
return 0;
err:
- free_percpu(ipvs->tot_stats.cpustats);
- return -ENOMEM;
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+
+err_percpu:
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+
+err_stats:
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
+
+err_vs:
+#endif
+ ip_vs_stats_release(&ipvs->tot_stats->s);
+
+err_tot_stats:
+ kfree(ipvs->tot_stats);
+
+out:
+ return ret;
}
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+ cancel_delayed_work_sync(&ipvs->est_reload_work);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
- free_percpu(ipvs->tot_stats.cpustats);
+#endif
+ call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
int __init ip_vs_register_nl_ioctl(void)
@@ -4184,8 +4532,6 @@ int __init ip_vs_control_init(void)
int idx;
int ret;
- EnterFunction(2);
-
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
@@ -4198,14 +4544,12 @@ int __init ip_vs_control_init(void)
if (ret < 0)
return ret;
- LeaveFunction(2);
return 0;
}
void ip_vs_control_cleanup(void)
{
- EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
- LeaveFunction(2);
+ /* relying on common rcu_barrier() in ip_vs_cleanup() */
}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 5e6ec32aff2b..bb7aca4601ff 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -30,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -270,3 +269,4 @@ static void __exit ip_vs_dh_cleanup(void)
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs destination hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 05b8112ffb37..77f4f637ff67 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -12,8 +12,7 @@
* get_stats()) do the per cpu summing.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -21,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
+#include <linux/rcupdate_wait.h>
#include <net/ip_vs.h>
@@ -30,9 +30,6 @@
long interval, it is easy to implement a user level daemon which
periodically reads those statistical counters and measure rate.
- Currently, the measurement is activated by slow timer handler. Hope
- this measurement will not introduce too much load.
-
We measure rate during the last 8 seconds every 2 seconds:
avgrate = avgrate*(1-W) + rate*W
@@ -47,65 +44,79 @@
to 32-bit values for conns, packets, bps, cps and pps.
* A lot of code is taken from net/core/gen_estimator.c
- */
-
-/*
- * Make a summary from each cpu
+ KEY POINTS:
+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
+ - the states of estimators can be read (get stats) or modified (zero stats)
+ from processes
+
+ KTHREADS:
+ - estimators are added initially to est_temp_list and later kthread 0
+ distributes them to one or many kthreads for estimation
+ - kthread contexts are created and attached to array
+ - the kthread tasks are started when first service is added, before that
+ the total stats are not estimated
+ - when configuration (cpulist/nice) is changed, the tasks are restarted
+ by work (est_reload_work)
+ - kthread tasks are stopped while the cpulist is empty
+ - the kthread context holds lists with estimators (chains) which are
+ processed every 2 seconds
+ - as estimators can be added dynamically and in bursts, we try to spread
+ them to multiple chains which are estimated at different time
+ - on start, kthread 0 enters calculation phase to determine the chain limits
+ and the limit of estimators per kthread
+ - est_add_ktid: ktid where to add new ests, can point to empty slot where
+ we should add kt data
*/
-static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
- struct ip_vs_cpu_stats __percpu *stats)
-{
- int i;
- bool add = false;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
- unsigned int start;
- u64 conns, inpkts, outpkts, inbytes, outbytes;
-
- if (add) {
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- conns = s->cnt.conns;
- inpkts = s->cnt.inpkts;
- outpkts = s->cnt.outpkts;
- inbytes = s->cnt.inbytes;
- outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- sum->conns += conns;
- sum->inpkts += inpkts;
- sum->outpkts += outpkts;
- sum->inbytes += inbytes;
- sum->outbytes += outbytes;
- } else {
- add = true;
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- sum->conns = s->cnt.conns;
- sum->inpkts = s->cnt.inpkts;
- sum->outpkts = s->cnt.outpkts;
- sum->inbytes = s->cnt.inbytes;
- sum->outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- }
- }
-}
+static struct lock_class_key __ipvs_est_key;
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
-static void estimation_timer(struct timer_list *t)
+static void ip_vs_chain_estimation(struct hlist_head *chain)
{
struct ip_vs_estimator *e;
+ struct ip_vs_cpu_stats *c;
struct ip_vs_stats *s;
u64 rate;
- struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
- spin_lock(&ipvs->est_lock);
- list_for_each_entry(e, &ipvs->est_list, list) {
+ hlist_for_each_entry_rcu(e, chain, list) {
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+ u64 kconns = 0, kinpkts = 0, koutpkts = 0;
+ u64 kinbytes = 0, koutbytes = 0;
+ unsigned int start;
+ int i;
+
+ if (kthread_should_stop())
+ break;
+
s = container_of(e, struct ip_vs_stats, est);
+ for_each_possible_cpu(i) {
+ c = per_cpu_ptr(s->cpustats, i);
+ do {
+ start = u64_stats_fetch_begin(&c->syncp);
+ conns = u64_stats_read(&c->cnt.conns);
+ inpkts = u64_stats_read(&c->cnt.inpkts);
+ outpkts = u64_stats_read(&c->cnt.outpkts);
+ inbytes = u64_stats_read(&c->cnt.inbytes);
+ outbytes = u64_stats_read(&c->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&c->syncp, start));
+ kconns += conns;
+ kinpkts += inpkts;
+ koutpkts += outpkts;
+ kinbytes += inbytes;
+ koutbytes += outbytes;
+ }
spin_lock(&s->lock);
- ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+ s->kstats.conns = kconns;
+ s->kstats.inpkts = kinpkts;
+ s->kstats.outpkts = koutpkts;
+ s->kstats.inbytes = kinbytes;
+ s->kstats.outbytes = koutbytes;
/* scaled by 2^10, but divided 2 seconds */
rate = (s->kstats.conns - e->last_conns) << 9;
@@ -130,28 +141,759 @@ static void estimation_timer(struct timer_list *t)
e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
spin_unlock(&s->lock);
}
- spin_unlock(&ipvs->est_lock);
- mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
-void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
- struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ int cid;
+
+ rcu_read_lock();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ goto out;
+ for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
+ if (kthread_should_stop())
+ break;
+ ip_vs_chain_estimation(&td->chains[cid]);
+ cond_resched_rcu();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ break;
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static int ip_vs_estimation_kthread(void *data)
+{
+ struct ip_vs_est_kt_data *kd = data;
+ struct netns_ipvs *ipvs = kd->ipvs;
+ int row = kd->est_row;
+ unsigned long now;
+ int id = kd->id;
+ long gap;
+
+ if (id > 0) {
+ if (!ipvs->est_chain_max)
+ return 0;
+ } else {
+ if (!ipvs->est_chain_max) {
+ ipvs->est_calc_phase = 1;
+ /* commit est_calc_phase before reading est_genid */
+ smp_mb();
+ }
+
+ /* kthread 0 will handle the calc phase */
+ if (ipvs->est_calc_phase)
+ ip_vs_est_calc_phase(ipvs);
+ }
- INIT_LIST_HEAD(&est->list);
+ while (1) {
+ if (!id && !hlist_empty(&ipvs->est_temp_list))
+ ip_vs_est_drain_temp_list(ipvs);
+ set_current_state(TASK_IDLE);
+ if (kthread_should_stop())
+ break;
+
+ /* before estimation, check if we should sleep */
+ now = jiffies;
+ gap = kd->est_timer - now;
+ if (gap > 0) {
+ if (gap > IPVS_EST_TICK) {
+ kd->est_timer = now - IPVS_EST_TICK;
+ gap = IPVS_EST_TICK;
+ }
+ schedule_timeout(gap);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ if (gap < -8 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ }
- spin_lock_bh(&ipvs->est_lock);
- list_add(&est->list, &ipvs->est_list);
- spin_unlock_bh(&ipvs->est_lock);
+ if (kd->tick_len[row])
+ ip_vs_tick_estimation(kd, row);
+
+ row++;
+ if (row >= IPVS_EST_NTICKS)
+ row = 0;
+ WRITE_ONCE(kd->est_row, row);
+ kd->est_timer += IPVS_EST_TICK;
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
}
+/* Schedule stop/start for kthread tasks */
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+{
+ /* Ignore reloads before first service is added */
+ if (!READ_ONCE(ipvs->enable))
+ return;
+ ip_vs_est_stopped_recalc(ipvs);
+ /* Bump the kthread configuration genid */
+ atomic_inc(&ipvs->est_genid);
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
+}
+
+/* Start kthread task with current configuration */
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ unsigned long now;
+ int ret = 0;
+ long gap;
+
+ lockdep_assert_held(&ipvs->est_mutex);
+
+ if (kd->task)
+ goto out;
+ now = jiffies;
+ gap = kd->est_timer - now;
+ /* Sync est_timer if task is starting later */
+ if (abs(gap) > 4 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
+ ipvs->gen, kd->id);
+ if (IS_ERR(kd->task)) {
+ ret = PTR_ERR(kd->task);
+ kd->task = NULL;
+ goto out;
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+ if (sysctl_est_preferred_cpulist(ipvs))
+ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+
+out:
+ return ret;
+}
+
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
+{
+ if (kd->task) {
+ pr_info("stopping estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ kd->task = NULL;
+ }
+}
+
+/* Apply parameters to kthread */
+static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ kd->chain_max = ipvs->est_chain_max;
+ /* We are using single chain on RCU preemption */
+ if (IPVS_EST_TICK_CHAINS == 1)
+ kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
+ kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
+ kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
+}
+
+/* Create and start estimation kthread in a free or new array slot */
+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ int id = ipvs->est_kt_count;
+ int ret = -ENOMEM;
+ void *arr = NULL;
+ int i;
+
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
+ return -EINVAL;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ for (i = 0; i < id; i++) {
+ if (!ipvs->est_kt_arr[i])
+ break;
+ }
+ if (i >= id) {
+ arr = krealloc_array(ipvs->est_kt_arr, id + 1,
+ sizeof(struct ip_vs_est_kt_data *),
+ GFP_KERNEL);
+ if (!arr)
+ goto out;
+ ipvs->est_kt_arr = arr;
+ } else {
+ id = i;
+ }
+
+ kd = kzalloc(sizeof(*kd), GFP_KERNEL);
+ if (!kd)
+ goto out;
+ kd->ipvs = ipvs;
+ bitmap_fill(kd->avail, IPVS_EST_NTICKS);
+ kd->est_timer = jiffies;
+ kd->id = id;
+ ip_vs_est_set_params(ipvs, kd);
+
+ /* Pre-allocate stats used in calc phase */
+ if (!id && !kd->calc_stats) {
+ kd->calc_stats = ip_vs_stats_alloc();
+ if (!kd->calc_stats)
+ goto out;
+ }
+
+ /* Start kthread tasks only when services are present */
+ if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
+ ret = ip_vs_est_kthread_start(ipvs, kd);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (arr)
+ ipvs->est_kt_count++;
+ ipvs->est_kt_arr[id] = kd;
+ kd = NULL;
+ /* Use most recent kthread for new ests */
+ ipvs->est_add_ktid = id;
+ ret = 0;
+
+out:
+ mutex_unlock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+
+ return ret;
+}
+
+/* Select ktid where to add new ests: available, unused or new slot */
+static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
+{
+ int ktid, best = ipvs->est_kt_count;
+ struct ip_vs_est_kt_data *kd;
+
+ for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
+ kd = ipvs->est_kt_arr[ktid];
+ if (kd) {
+ if (kd->est_count < kd->est_max_count) {
+ best = ktid;
+ break;
+ }
+ } else if (ktid < best) {
+ best = ktid;
+ }
+ }
+ ipvs->est_add_ktid = best;
+}
+
+/* Add estimator to current kthread (est_add_ktid) */
+static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
+ struct ip_vs_estimator *est)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ struct ip_vs_est_tick_data *td;
+ int ktid, row, crow, cid, ret;
+ int delay = est->ktrow;
+
+ BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
+ "Too many chains for ktcid");
+
+ if (ipvs->est_add_ktid < ipvs->est_kt_count) {
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+ if (kd)
+ goto add_est;
+ }
+
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret < 0)
+ goto out;
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+
+add_est:
+ ktid = kd->id;
+ /* For small number of estimators prefer to use few ticks,
+ * otherwise try to add into the last estimated row.
+ * est_row and add_row point after the row we should use
+ */
+ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
+ crow = READ_ONCE(kd->est_row);
+ else
+ crow = kd->add_row;
+ crow += delay;
+ if (crow >= IPVS_EST_NTICKS)
+ crow -= IPVS_EST_NTICKS;
+ /* Assume initial delay ? */
+ if (delay >= IPVS_EST_NTICKS - 1) {
+ /* Preserve initial delay or decrease it if no space in tick */
+ row = crow;
+ if (crow < IPVS_EST_NTICKS - 1) {
+ crow++;
+ row = find_last_bit(kd->avail, crow);
+ }
+ if (row >= crow)
+ row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
+ } else {
+ /* Preserve delay or increase it if no space in tick */
+ row = IPVS_EST_NTICKS;
+ if (crow > 0)
+ row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
+ if (row >= IPVS_EST_NTICKS)
+ row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
+ }
+
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td) {
+ td = kzalloc(sizeof(*td), GFP_KERNEL);
+ if (!td) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(kd->ticks[row], td);
+ }
+
+ cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
+
+ kd->est_count++;
+ kd->tick_len[row]++;
+ if (!td->chain_len[cid])
+ __set_bit(cid, td->present);
+ td->chain_len[cid]++;
+ est->ktid = ktid;
+ est->ktrow = row;
+ est->ktcid = cid;
+ hlist_add_head_rcu(&est->list, &td->chains[cid]);
+
+ if (td->chain_len[cid] >= kd->chain_max) {
+ __set_bit(cid, td->full);
+ if (kd->tick_len[row] >= kd->tick_max)
+ __clear_bit(row, kd->avail);
+ }
+
+ /* Update est_add_ktid to point to first available/empty kt slot */
+ if (kd->est_count == kd->est_max_count)
+ ip_vs_est_update_ktid(ipvs);
+
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/* Start estimation for stats */
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ int ret;
+
+ if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
+ est->ktid = -1;
+ est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
+
+ /* We prefer this code to be short, kthread 0 will requeue the
+ * estimator to available chain. If tasks are disabled, we
+ * will not allocate much memory, just for kt 0.
+ */
+ ret = 0;
+ if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret >= 0)
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ else
+ INIT_HLIST_NODE(&est->list);
+ return ret;
+}
+
+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
+{
+ if (kd) {
+ if (kd->task) {
+ pr_info("stop unused estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ }
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+}
+
+/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ int ktid = est->ktid;
+ int row = est->ktrow;
+ int cid = est->ktcid;
+
+ /* Failed to add to chain ? */
+ if (hlist_unhashed(&est->list))
+ return;
+
+ /* On return, estimator can be freed, dequeue it now */
+
+ /* In est_temp_list ? */
+ if (ktid < 0) {
+ hlist_del(&est->list);
+ goto end_kt0;
+ }
+
+ hlist_del_rcu(&est->list);
+ kd = ipvs->est_kt_arr[ktid];
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ __clear_bit(cid, td->full);
+ td->chain_len[cid]--;
+ if (!td->chain_len[cid])
+ __clear_bit(cid, td->present);
+ kd->tick_len[row]--;
+ __set_bit(row, kd->avail);
+ if (!kd->tick_len[row]) {
+ RCU_INIT_POINTER(kd->ticks[row], NULL);
+ kfree_rcu(td, rcu_head);
+ }
+ kd->est_count--;
+ if (kd->est_count) {
+ /* This kt slot can become available just now, prefer it */
+ if (ktid < ipvs->est_add_ktid)
+ ipvs->est_add_ktid = ktid;
+ return;
+ }
+
+ if (ktid > 0) {
+ mutex_lock(&ipvs->est_mutex);
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[ktid] = NULL;
+ if (ktid == ipvs->est_kt_count - 1) {
+ ipvs->est_kt_count--;
+ while (ipvs->est_kt_count > 1 &&
+ !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
+ ipvs->est_kt_count--;
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* This slot is now empty, prefer another available kt slot */
+ if (ktid == ipvs->est_add_ktid)
+ ip_vs_est_update_ktid(ipvs);
+ }
+
+end_kt0:
+ /* kt 0 is freed after all other kthreads and chains are empty */
+ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
+ kd = ipvs->est_kt_arr[0];
+ if (!kd || !kd->est_count) {
+ mutex_lock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[0] = NULL;
+ }
+ ipvs->est_kt_count--;
+ mutex_unlock(&ipvs->est_mutex);
+ ipvs->est_add_ktid = 0;
+ }
+ }
+}
+
+/* Register all ests from est_temp_list to kthreads */
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_estimator *est;
+
+ while (1) {
+ int max = 16;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ while (max-- > 0) {
+ est = hlist_entry_safe(ipvs->est_temp_list.first,
+ struct ip_vs_estimator, list);
+ if (est) {
+ if (kthread_should_stop())
+ goto unlock;
+ hlist_del_init(&est->list);
+ if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
+ continue;
+ est->ktid = -1;
+ hlist_add_head(&est->list,
+ &ipvs->est_temp_list);
+ /* Abort, some entries will not be estimated
+ * until next attempt
+ */
+ }
+ goto unlock;
+ }
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ }
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
+}
+
+/* Calculate limits for all kthreads */
+static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct ip_vs_est_kt_data *kd;
+ struct hlist_head chain;
+ struct ip_vs_stats *s;
+ int cache_factor = 4;
+ int i, loops, ntest;
+ s32 min_est = 0;
+ ktime_t t1, t2;
+ int max = 8;
+ int ret = 1;
+ s64 diff;
+ u64 val;
+
+ INIT_HLIST_HEAD(&chain);
+ mutex_lock(&__ip_vs_mutex);
+ kd = ipvs->est_kt_arr[0];
+ mutex_unlock(&__ip_vs_mutex);
+ s = kd ? kd->calc_stats : NULL;
+ if (!s)
+ goto out;
+ hlist_add_head(&s->est.list, &chain);
+
+ loops = 1;
+ /* Get best result from many tests */
+ for (ntest = 0; ntest < 12; ntest++) {
+ if (!(ntest & 3)) {
+ /* Wait for cpufreq frequency transition */
+ wait_event_idle_timeout(wq, kthread_should_stop(),
+ HZ / 50);
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto stop;
+ }
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ /* Put stats in cache */
+ ip_vs_chain_estimation(&chain);
+
+ t1 = ktime_get();
+ for (i = loops * cache_factor; i > 0; i--)
+ ip_vs_chain_estimation(&chain);
+ t2 = ktime_get();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto stop;
+ cond_resched();
+
+ diff = ktime_to_ns(ktime_sub(t2, t1));
+ if (diff <= 1 * NSEC_PER_USEC) {
+ /* Do more loops on low time resolution */
+ loops *= 2;
+ continue;
+ }
+ if (diff >= NSEC_PER_SEC)
+ continue;
+ val = diff;
+ do_div(val, loops);
+ if (!min_est || val < min_est) {
+ min_est = val;
+ /* goal: 95usec per chain */
+ val = 95 * NSEC_PER_USEC;
+ if (val >= min_est) {
+ do_div(val, min_est);
+ max = (int)val;
+ } else {
+ max = 1;
+ }
+ }
+ }
+
+out:
+ if (s)
+ hlist_del_init(&s->est.list);
+ *chain_max = max;
+ return ret;
+
+stop:
+ ret = 0;
+ goto out;
+}
+
+/* Calculate the parameters and apply them in context of kt #0
+ * ECP: est_calc_phase
+ * ECM: est_chain_max
+ * ECP ECM Insert Chain enable Description
+ * ---------------------------------------------------------------------------
+ * 0 0 est_temp_list 0 create kt #0 context
+ * 0 0 est_temp_list 0->1 service added, start kthread #0 task
+ * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase
+ * 1 0 est_temp_list 1 kt #0: determine est_chain_max,
+ * stop tasks, move ests to est_temp_list
+ * and free kd for kthreads 1..last
+ * 1->0 0->N kt chains 1 ests can go to kthreads
+ * 0 N kt chains 1 drain est_temp_list, create new kthread
+ * contexts, start tasks, estimate
+ */
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
+{
+ int genid = atomic_read(&ipvs->est_genid);
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ struct ip_vs_estimator *est;
+ struct ip_vs_stats *stats;
+ int id, row, cid, delay;
+ bool last, last_td;
+ int chain_max;
+ int step;
+
+ if (!ip_vs_est_calc_limits(ipvs, &chain_max))
+ return;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Stop all other tasks, so that we can immediately move the
+ * estimators to est_temp_list without RCU grace period
+ */
+ mutex_lock(&ipvs->est_mutex);
+ for (id = 1; id < ipvs->est_kt_count; id++) {
+ /* netns clean up started, abort */
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock2;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ continue;
+ ip_vs_est_kthread_stop(kd);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* Move all estimators to est_temp_list but carefully,
+ * all estimators and kthread data can be released while
+ * we reschedule. Even for kthread 0.
+ */
+ step = 0;
+
+ /* Order entries in est_temp_list in ascending delay, so now
+ * walk delay(desc), id(desc), cid(asc)
+ */
+ delay = IPVS_EST_NTICKS;
+
+next_delay:
+ delay--;
+ if (delay < 0)
+ goto end_dequeue;
+
+last_kt:
+ /* Destroy contexts backwards */
+ id = ipvs->est_kt_count;
+
+next_kt:
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto unlock;
+ id--;
+ if (id < 0)
+ goto next_delay;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ goto next_kt;
+ /* kt 0 can exist with empty chains */
+ if (!id && kd->est_count <= 1)
+ goto next_delay;
+
+ row = kd->est_row + delay;
+ if (row >= IPVS_EST_NTICKS)
+ row -= IPVS_EST_NTICKS;
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td)
+ goto next_kt;
+
+ cid = 0;
+
+walk_chain:
+ if (kthread_should_stop())
+ goto unlock;
+ step++;
+ if (!(step & 63)) {
+ /* Give chance estimators to be added (to est_temp_list)
+ * and deleted (releasing kthread contexts)
+ */
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Current kt released ? */
+ if (id >= ipvs->est_kt_count)
+ goto last_kt;
+ if (kd != ipvs->est_kt_arr[id])
+ goto next_kt;
+ /* Current td released ? */
+ if (td != rcu_dereference_protected(kd->ticks[row], 1))
+ goto next_kt;
+ /* No fatal changes on the current kd and td */
+ }
+ est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
+ list);
+ if (!est) {
+ cid++;
+ if (cid >= IPVS_EST_TICK_CHAINS)
+ goto next_kt;
+ goto walk_chain;
+ }
+ /* We can cheat and increase est_count to protect kt 0 context
+ * from release but we prefer to keep the last estimator
+ */
+ last = kd->est_count <= 1;
+ /* Do not free kt #0 data */
+ if (!id && last)
+ goto next_delay;
+ last_td = kd->tick_len[row] <= 1;
+ stats = container_of(est, struct ip_vs_stats, est);
+ ip_vs_stop_estimator(ipvs, stats);
+ /* Tasks are stopped, move without RCU grace period */
+ est->ktid = -1;
+ est->ktrow = row - kd->est_row;
+ if (est->ktrow < 0)
+ est->ktrow += IPVS_EST_NTICKS;
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ /* kd freed ? */
+ if (last)
+ goto next_kt;
+ /* td freed ? */
+ if (last_td)
+ goto next_kt;
+ goto walk_chain;
+
+end_dequeue:
+ /* All estimators removed while calculating ? */
+ if (!ipvs->est_kt_count)
+ goto unlock;
+ kd = ipvs->est_kt_arr[0];
+ if (!kd)
+ goto unlock;
+ kd->add_row = kd->est_row;
+ ipvs->est_chain_max = chain_max;
+ ip_vs_est_set_params(ipvs, kd);
+
+ pr_info("using max %d ests per chain, %d per kthread\n",
+ kd->chain_max, kd->est_max_count);
+
+ /* Try to keep tot_stats in kt0, enqueue it early */
+ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
+ ipvs->tot_stats->s.est.ktid == -1) {
+ hlist_del(&ipvs->tot_stats->s.est.list);
+ hlist_add_head(&ipvs->tot_stats->s.est.list,
+ &ipvs->est_temp_list);
+ }
+
+ mutex_lock(&ipvs->est_mutex);
- spin_lock_bh(&ipvs->est_lock);
- list_del(&est->list);
- spin_unlock_bh(&ipvs->est_lock);
+ /* We completed the calc phase, new calc phase not requested */
+ if (genid == atomic_read(&ipvs->est_genid))
+ ipvs->est_calc_phase = 0;
+
+unlock2:
+ mutex_unlock(&ipvs->est_mutex);
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -186,14 +928,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
- INIT_LIST_HEAD(&ipvs->est_list);
- spin_lock_init(&ipvs->est_lock);
- timer_setup(&ipvs->est_timer, estimation_timer, 0);
- mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ INIT_HLIST_HEAD(&ipvs->est_temp_list);
+ ipvs->est_kt_arr = NULL;
+ ipvs->est_max_threads = 0;
+ ipvs->est_calc_phase = 0;
+ ipvs->est_chain_max = 0;
+ ipvs->est_kt_count = 0;
+ ipvs->est_add_ktid = 0;
+ atomic_set(&ipvs->est_genid, 0);
+ atomic_set(&ipvs->est_genid_done, 0);
+ __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
return 0;
}
void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
- del_timer_sync(&ipvs->est_timer);
+ int i;
+
+ for (i = 0; i < ipvs->est_kt_count; i++)
+ ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
+ kfree(ipvs->est_kt_arr);
+ mutex_destroy(&ipvs->est_mutex);
}
diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c
index b846cc385279..d657b47c6511 100644
--- a/net/netfilter/ipvs/ip_vs_fo.c
+++ b/net/netfilter/ipvs/ip_vs_fo.c
@@ -8,8 +8,7 @@
* Kenny Mathis : added initial functionality based on weight
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -72,3 +71,4 @@ static void __exit ip_vs_fo_cleanup(void)
module_init(ip_vs_fo_init);
module_exit(ip_vs_fo_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted failover scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index cf925906f59b..b315c608fda4 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -16,8 +16,7 @@
* Author: Wouter Gadeyne
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -35,7 +34,7 @@
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/tcp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/ip_vs.h>
@@ -53,6 +52,7 @@ enum {
IP_VS_FTP_EPSV,
};
+static bool exiting_module;
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
@@ -591,8 +591,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %u\n",
- app->name, i, ports[i]);
}
return 0;
@@ -607,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- if (!ipvs)
+ if (!ipvs || !exiting_module)
return;
unregister_ip_vs_app(ipvs, &ip_vs_ftp);
@@ -629,6 +627,7 @@ static int __init ip_vs_ftp_init(void)
*/
static void __exit ip_vs_ftp_exit(void)
{
+ exiting_module = true;
unregister_pernet_subsys(&ip_vs_ftp_ops);
/* rcu_barrier() is called by netns */
}
@@ -637,3 +636,4 @@ static void __exit ip_vs_ftp_exit(void)
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs ftp helper");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 7ac7473e3804..e6c8ed0c92f6 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -34,8 +34,7 @@
* me to write this module.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -123,7 +122,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -293,7 +291,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
- struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -384,7 +383,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
struct ip_vs_lblc_table *tbl = svc->sched_data;
/* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
+ timer_shutdown_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblc_flush(svc);
@@ -550,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
static int __net_init __ip_vs_lblc_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -563,15 +563,16 @@ static int __net_init __ip_vs_lblc_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblc_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblc_ctl_table = vs_vars_table;
ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
- ipvs->lblc_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblc_ctl_table,
+ vars_table_size);
if (!ipvs->lblc_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblc_ctl_table);
@@ -628,3 +629,4 @@ static void __exit ip_vs_lblc_cleanup(void)
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 77c323c36a88..a25cf7bb6185 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -32,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/module.h>
@@ -294,7 +293,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -457,7 +455,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblcr_check_expire(struct timer_list *t)
{
- struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -547,7 +546,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
struct ip_vs_lblcr_table *tbl = svc->sched_data;
/* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
+ timer_shutdown_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblcr_flush(svc);
@@ -736,6 +735,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
static int __net_init __ip_vs_lblcr_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -749,14 +749,15 @@ static int __net_init __ip_vs_lblcr_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblcr_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblcr_ctl_table = vs_vars_table;
ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
- ipvs->lblcr_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblcr_ctl_table,
+ vars_table_size);
if (!ipvs->lblcr_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblcr_ctl_table);
@@ -813,3 +814,4 @@ static void __exit ip_vs_lblcr_cleanup(void)
module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 9d34d81fc6f1..38cc38c5d8bb 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -9,8 +9,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -86,3 +85,4 @@ static void __exit ip_vs_lc_cleanup(void)
module_init(ip_vs_lc_init);
module_exit(ip_vs_lc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index da0280cec506..f61f54004c9e 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -174,8 +173,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
return 0;
}
- table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
- sizeof(unsigned long), GFP_KERNEL);
+ table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -227,7 +225,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
}
out:
- kfree(table);
+ bitmap_free(table);
return 0;
}
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 08adcb222986..81974f69e5bb 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -30,8 +30,7 @@
* PASV response can not be NAT-ed) but Active FTP should work
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/types.h>
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index f56862a87518..ada158c610ce 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -26,8 +26,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -136,3 +135,4 @@ static void __exit ip_vs_nq_cleanup(void)
module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs never queue scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c
index c03066fdd5ca..c5c67df80a0b 100644
--- a/net/netfilter/ipvs/ip_vs_ovf.c
+++ b/net/netfilter/ipvs/ip_vs_ovf.c
@@ -12,8 +12,7 @@
* active connections
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -79,3 +78,4 @@ static void __exit ip_vs_ovf_cleanup(void)
module_init(ip_vs_ovf_init);
module_exit(ip_vs_ovf_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs overflow connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 166c669f0763..3035079ebd99 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 0ac6705a61d3..85f31d71e29a 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -185,3 +184,4 @@ static void __exit ip_vs_sip_cleanup(void)
module_init(ip_vs_sip_init);
module_exit(ip_vs_sip_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs sip helper");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index f100da4ba3bc..fd9dbca24c85 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -8,8 +8,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -340,7 +339,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_protocol_init(void)
{
- char protocols[64];
+ char protocols[64] = { 0 };
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
@@ -348,8 +347,6 @@ int __init ip_vs_protocol_init(void)
strcat(protocols, (p)->name); \
} while (0)
- protocols[0] = '\0';
- protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 89602c16f6b6..44e14acc187e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -6,8 +6,7 @@
* Wensong Zhang <wensong@linuxvirtualserver.org>
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index a0921adc31a9..83e452916403 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -126,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (sctph->source != cp->vport || payload_csum ||
skb->ip_summed == CHECKSUM_PARTIAL) {
sctph->source = cp->vport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
@@ -174,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
(skb->ip_summed == CHECKSUM_PARTIAL &&
!(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
sctph->dest = cp->dport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 32b028853a7c..f68a1533ee45 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -13,8 +13,7 @@
* protocol ip_vs_proto_data and is handled by netns
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -315,7 +314,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -539,8 +538,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
- IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
- "%s:%d state: %s->%s conn->refcnt:%d\n",
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
+ "d:%s:%d state: %s->%s conn->refcnt:%d\n",
pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
@@ -548,10 +547,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
th->fin ? 'F' : '.',
th->ack ? 'A' : '.',
th->rst ? 'R' : '.',
- IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
- ntohs(cp->dport),
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
refcount_read(&cp->refcnt));
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 153d89647c87..0f0107c80dd2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,8 +9,7 @@
* Network name space (netns) aware.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
@@ -318,7 +317,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index 38495c6f6c7c..4125ee561cdc 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -14,8 +14,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -122,4 +121,5 @@ static void __exit ip_vs_rr_cleanup(void)
module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
+MODULE_DESCRIPTION("ipvs round-robin scheduler");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index d4903723be7e..c6e421c4e299 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -12,8 +12,7 @@
* Changes:
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 7663288e5358..245a323c84cd 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -30,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -137,3 +136,4 @@ static void __exit ip_vs_sed_cleanup(void)
module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs shortest expected delay scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index c2028e412092..0e85e07e23b9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -32,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -376,3 +375,4 @@ static void __exit ip_vs_sh_cleanup(void)
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs source hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 605e0f68f8bd..54dd1514ac45 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -32,8 +32,7 @@
* Persistence support, fwmark and time-out.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/slab.h>
@@ -51,7 +50,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
-#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */
#include <net/ip.h>
#include <net/sock.h>
@@ -242,9 +241,6 @@ struct ip_vs_sync_thread_data {
| IPVS Sync Connection (1) |
*/
-#define SYNC_MESG_HEADER_LEN 4
-#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
@@ -606,7 +602,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
struct ip_vs_sync_conn_options *opt =
(struct ip_vs_sync_conn_options *)&s[1];
- memcpy(opt, &cp->in_seq, sizeof(*opt));
+ memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
}
m->nr_conns++;
@@ -618,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
cp = cp->control;
if (cp) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -779,7 +775,7 @@ control:
if (!cp)
return;
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
goto sloop;
@@ -1283,12 +1279,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
lock_sock(sk);
if (mode) {
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
- sysctl_wmem_max);
+ READ_ONCE(sysctl_wmem_max));
sk->sk_sndbuf = val * 2;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
} else {
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
- sysctl_rmem_max);
+ READ_ONCE(sysctl_rmem_max));
sk->sk_rcvbuf = val * 2;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
@@ -1300,20 +1296,14 @@ static void set_sock_size(struct sock *sk, int mode, int val)
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
{
- struct inet_sock *inet = inet_sk(sk);
-
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
- lock_sock(sk);
- inet->mc_loop = loop ? 1 : 0;
+ inet_assign_bit(MC_LOOP, sk, loop);
#ifdef CONFIG_IP_VS_IPV6
- if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
+ if (READ_ONCE(sk->sk_family) == AF_INET6) {
/* IPV6_MULTICAST_LOOP */
- np->mc_loop = loop ? 1 : 0;
+ inet6_assign_bit(MC6_LOOP, sk, loop);
}
#endif
- release_sock(sk);
}
/*
@@ -1325,13 +1315,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
- inet->mc_ttl = ttl;
+ WRITE_ONCE(inet->mc_ttl, ttl);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_HOPS */
- np->mcast_hops = ttl;
+ WRITE_ONCE(np->mcast_hops, ttl);
}
#endif
release_sock(sk);
@@ -1344,13 +1334,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MTU_DISCOVER */
- np->pmtudisc = val;
+ WRITE_ONCE(np->pmtudisc, val);
}
#endif
release_sock(sk);
@@ -1374,7 +1364,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev)
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_IF */
- np->mcast_oif = dev->ifindex;
+ WRITE_ONCE(np->mcast_oif, dev->ifindex);
}
#endif
release_sock(sk);
@@ -1444,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
sin.sin_addr.s_addr = addr;
sin.sin_port = 0;
- return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+ return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
}
static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
@@ -1510,8 +1500,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id,
}
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
- result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
- salen, 0);
+ result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr,
+ salen, 0);
if (result < 0) {
pr_err("Error connecting to the multicast addr\n");
goto error;
@@ -1551,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id,
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
sock->sk->sk_bound_dev_if = dev->ifindex;
- result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
+ result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
goto error;
@@ -1585,13 +1575,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
struct kvec iov;
int len;
- EnterFunction(7);
iov.iov_base = (void *)buffer;
iov.iov_len = length;
len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
- LeaveFunction(7);
return len;
}
@@ -1617,15 +1605,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
struct kvec iov = {buffer, buflen};
int len;
- EnterFunction(7);
-
/* Receive a packet */
- iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
if (len < 0)
return len;
- LeaveFunction(7);
return len;
}
@@ -1717,6 +1702,8 @@ static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = tinfo->ipvs;
+ struct sock *sk = tinfo->sock->sk;
+ struct udp_sock *up = udp_sk(sk);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
@@ -1724,12 +1711,14 @@ static int sync_thread_backup(void *data)
ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
while (!kthread_should_stop()) {
- wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
- !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
- || kthread_should_stop());
+ wait_event_interruptible(*sk_sleep(sk),
+ !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue) ||
+ kthread_should_stop());
/* do we have data now? */
- while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+ while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue)) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->bcfg.sync_maxlen);
if (len <= 0) {
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
new file mode 100644
index 000000000000..dbb7f5fd4688
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* IPVS: Power of Twos Choice Scheduling module
+ *
+ * Authors: Darby Payne <darby.payne@applovin.com>
+ */
+
+#define pr_fmt(fmt) "IPVS: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+/* Power of Twos Choice scheduling, algorithm originally described by
+ * Michael Mitzenmacher.
+ *
+ * Randomly picks two destinations and picks the one with the least
+ * amount of connections
+ *
+ * The algorithm calculates a few variables
+ * - total_weight = sum of all weights
+ * - rweight1 = random number between [0,total_weight]
+ * - rweight2 = random number between [0,total_weight]
+ *
+ * For each destination
+ * decrement rweight1 and rweight2 by the destination weight
+ * pick choice1 when rweight1 is <= 0
+ * pick choice2 when rweight2 is <= 0
+ *
+ * Return choice2 if choice2 has less connections than choice 1 normalized
+ * by weight
+ *
+ * References
+ * ----------
+ *
+ * [Mitzenmacher 2016]
+ * The Power of Two Random Choices: A Survey of Techniques and Results
+ * Michael Mitzenmacher, Andrea W. Richa y, Ramesh Sitaraman
+ * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
+ *
+ */
+static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
+ const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL;
+ int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0;
+ int overhead2, total_weight = 0, weight;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* Generate a random weight between [0,sum of all weights) */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ total_weight += weight;
+ choice1 = dest;
+ }
+ }
+ }
+
+ if (!choice1) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Add 1 to total_weight so that the random weights are inclusive
+ * from 0 to total_weight
+ */
+ total_weight += 1;
+ rweight1 = get_random_u32_below(total_weight);
+ rweight2 = get_random_u32_below(total_weight);
+
+ /* Pick two weighted servers */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ weight = atomic_read(&dest->weight);
+ if (weight <= 0)
+ continue;
+
+ rweight1 -= weight;
+ rweight2 -= weight;
+
+ if (rweight1 <= 0 && weight1 == -1) {
+ choice1 = dest;
+ weight1 = weight;
+ overhead1 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (rweight2 <= 0 && weight2 == -1) {
+ choice2 = dest;
+ weight2 = weight;
+ overhead2 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (weight1 != -1 && weight2 != -1)
+ goto nextstage;
+ }
+
+nextstage:
+ if (choice2 && (weight2 * overhead1) > (weight1 * overhead2))
+ choice1 = choice2;
+
+ IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(choice1->af, &choice1->addr),
+ ntohs(choice1->port), atomic_read(&choice1->activeconns),
+ refcount_read(&choice1->refcnt),
+ atomic_read(&choice1->weight));
+
+ return choice1;
+}
+
+static struct ip_vs_scheduler ip_vs_twos_scheduler = {
+ .name = "twos",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list),
+ .schedule = ip_vs_twos_schedule,
+};
+
+static int __init ip_vs_twos_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_twos_scheduler);
+}
+
+static void __exit ip_vs_twos_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_twos_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_twos_init);
+module_exit(ip_vs_twos_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs power of twos choice scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index 09f584b564a0..9da445ca09a1 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -14,8 +14,7 @@
* Wensong Zhang : added any dest with weight=0 is quiesced
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -109,3 +108,4 @@ static void __exit ip_vs_wlc_cleanup(void)
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 1bc7a0789d85..99f09cbf2d9b 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -13,8 +13,7 @@
* with weight 0 when all weights are zero
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -263,3 +262,4 @@ static void __exit ip_vs_wrr_cleanup(void)
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted round-robin scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b00866d777fe..3162ce3c2640 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -21,8 +21,7 @@
* - the only place where we can see skb->sk != NULL
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -97,7 +96,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
if (!dest_dst)
return NULL;
dst = dest_dst->dst_cache;
- if (dst->obsolete &&
+ if (READ_ONCE(dst->obsolete) &&
dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
return dest_dst;
@@ -119,13 +118,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
return false;
}
-/* Get route to daddr, update *saddr, optionally bind route to saddr */
+/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *ret_saddr)
{
struct flowi4 fl4;
struct rtable *rt;
- bool loop = false;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -135,23 +133,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
retry:
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
- /* Invalid saddr ? */
- if (PTR_ERR(rt) == -EINVAL && *saddr &&
- rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
- *saddr = 0;
- flowi4_update_output(&fl4, 0, 0, daddr, 0);
- goto retry;
- }
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
return NULL;
- } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ }
+ if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
- *saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
- loop = true;
+ flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
+ rt_mode = 0;
goto retry;
}
- *saddr = fl4.saddr;
+ if (ret_saddr)
+ *ret_saddr = fl4.saddr;
return rt;
}
@@ -180,7 +172,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
(addr_type & IPV6_ADDR_LOOPBACK);
old_rt_is_local = __ip_vs_is_local_route6(
- (struct rt6_info *)skb_dst(skb));
+ dst_rt6_info(skb_dst(skb)));
} else
#endif
{
@@ -271,7 +263,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
@@ -286,7 +278,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
{
if (ip_hdr(skb)->ttl <= 1) {
/* Tell the sender its packet died... */
- __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
return false;
}
@@ -318,7 +310,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rtable *) dest_dst->dst_cache;
+ rt = dst_rtable(dest_dst->dst_cache);
else {
dest_dst = ip_vs_dest_dst_alloc();
spin_lock_bh(&dest->dst_lock);
@@ -339,24 +331,20 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest_dst->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.ip;
} else {
- __be32 saddr = htonl(INADDR_ANY);
-
noref = 0;
/* For such unconfigured boxes avoid many route lookups
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
if (!rt)
goto err_unreach;
- if (ret_saddr)
- *ret_saddr = saddr;
}
local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
@@ -390,10 +378,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
skb->ip_summed == CHECKSUM_PARTIAL)
mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
mtu -= gre_calc_hlen(tflags);
}
if (mtu < 68) {
@@ -481,7 +469,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rt6_info *) dest_dst->dst_cache;
+ rt = dst_rt6_info(dest_dst->dst_cache);
else {
u32 cookie;
@@ -501,13 +489,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
ip_vs_dest_dst_free(dest_dst);
goto err_unreach;
}
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest_dst->dst_saddr.in6,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.in6;
@@ -517,7 +505,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
rt_mode);
if (!dst)
goto err_unreach;
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
}
local = __ip_vs_is_local_route6(rt);
@@ -553,10 +541,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
skb->ip_summed == CHECKSUM_PARTIAL)
mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
mtu -= gre_calc_hlen(tflags);
}
if (mtu < IPV6_MIN_MTU) {
@@ -609,6 +597,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset_ct(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
}
return ret;
}
@@ -649,6 +639,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -669,6 +661,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -700,8 +694,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct iphdr *iph = ip_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
goto tx_error;
@@ -713,12 +705,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -729,8 +719,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
&iph->daddr, NULL,
ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
@@ -741,12 +729,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -762,8 +748,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
@@ -833,12 +817,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -850,8 +832,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
int local, rc;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
@@ -870,7 +850,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_RDR);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -921,11 +901,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
return NF_STOLEN;
}
@@ -988,7 +966,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
old_dsfield = ipv4_get_dsfield(old_iph);
*ttl = old_iph->ttl;
if (payload_len)
- *payload_len = ntohs(old_iph->tot_len);
+ *payload_len = skb_ip_totlen(skb);
}
/* Implement full-functionality option for ECN encapsulation */
@@ -1092,11 +1070,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb,
{
__be16 proto = *next_protocol == IPPROTO_IPIP ?
htons(ETH_P_IP) : htons(ETH_P_IPV6);
- __be16 tflags = 0;
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t hdrlen;
if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
hdrlen = gre_calc_hlen(tflags);
gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
@@ -1143,8 +1121,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
int tun_type, gso_type;
int tun_flags;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1177,11 +1153,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom += sizeof(struct udphdr) + gue_hdrlen;
} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t gre_hdrlen;
- __be16 tflags = 0;
if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
gre_hdrlen = gre_calc_hlen(tflags);
max_headroom += gre_hdrlen;
@@ -1193,7 +1169,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
gso_type = __tun_gso_type_mask(AF_INET, cp->af);
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1219,6 +1195,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->transport_header = skb->network_header;
skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
bool check = false;
@@ -1261,14 +1238,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
@@ -1292,8 +1265,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int tun_type, gso_type;
int tun_flags;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
&saddr, ipvsh, 1,
@@ -1305,7 +1276,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (local)
return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
tdev = rt->dst.dev;
/*
@@ -1327,11 +1298,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
max_headroom += sizeof(struct udphdr) + gue_hdrlen;
} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
size_t gre_hdrlen;
- __be16 tflags = 0;
if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
- tflags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
gre_hdrlen = gre_calc_hlen(tflags);
max_headroom += gre_hdrlen;
@@ -1341,7 +1312,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
&next_protocol, &payload_len,
&dsfield, &ttl, NULL);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1367,6 +1338,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb->transport_header = skb->network_header;
skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
bool check = false;
@@ -1408,14 +1380,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
#endif
@@ -1431,8 +1399,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1449,12 +1415,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -1465,8 +1429,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
NULL, ipvsh, 0,
@@ -1483,12 +1445,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -1508,8 +1468,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode, was_input;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1520,7 +1478,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1576,14 +1534,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
- out:
- LeaveFunction(10);
return rc;
}
@@ -1598,8 +1553,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1610,7 +1563,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1625,7 +1578,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
&cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -1665,14 +1618,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
-out:
- LeaveFunction(10);
return rc;
}
#endif
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..46e667a50d98
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <uapi/linux/netfilter_ipv4.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
+ const struct nf_hook_state *s)
+{
+ const struct bpf_prog *prog = bpf_prog;
+ struct bpf_nf_ctx ctx = {
+ .state = s,
+ .skb = skb,
+ };
+
+ return bpf_prog_run_pin_on_cpu(prog, &ctx);
+}
+
+struct bpf_nf_link {
+ struct bpf_link link;
+ struct nf_hook_ops hook_ops;
+ netns_tracker ns_tracker;
+ struct net *net;
+ u32 dead;
+ const struct nf_defrag_hook *defrag_hook;
+};
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+static const struct nf_defrag_hook *
+get_proto_defrag_hook(struct bpf_nf_link *link,
+ const struct nf_defrag_hook __rcu **ptr_global_hook,
+ const char *mod)
+{
+ const struct nf_defrag_hook *hook;
+ int err;
+
+ /* RCU protects us from races against module unloading */
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ if (!hook) {
+ rcu_read_unlock();
+ err = request_module("%s", mod);
+ if (err)
+ return ERR_PTR(err < 0 ? err : -EINVAL);
+
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ }
+
+ if (hook && try_module_get(hook->owner)) {
+ /* Once we have a refcnt on the module, we no longer need RCU */
+ hook = rcu_pointer_handoff(hook);
+ } else {
+ WARN_ONCE(!hook, "%s has bad registration", mod);
+ hook = ERR_PTR(-ENOENT);
+ }
+ rcu_read_unlock();
+
+ if (!IS_ERR(hook)) {
+ err = hook->enable(link->net);
+ if (err) {
+ module_put(hook->owner);
+ hook = ERR_PTR(err);
+ }
+ }
+
+ return hook;
+}
+#endif
+
+static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook __maybe_unused *hook;
+
+ switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+ case NFPROTO_IPV4:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ case NFPROTO_IPV6:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+ default:
+ return -EAFNOSUPPORT;
+ }
+}
+
+static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook *hook = link->defrag_hook;
+
+ if (!hook)
+ return;
+ hook->disable(link->net);
+ module_put(hook->owner);
+}
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ if (nf_link->dead)
+ return;
+
+ /* do not double release in case .detach was already called */
+ if (!cmpxchg(&nf_link->dead, 0, 1)) {
+ nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+ bpf_nf_disable_defrag(nf_link);
+ put_net_track(nf_link->net, &nf_link->ns_tracker);
+ }
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+ bpf_nf_link_release(link);
+ return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+ nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+ nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+ const struct nf_defrag_hook *hook = nf_link->defrag_hook;
+
+ info->netfilter.pf = nf_link->hook_ops.pf;
+ info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+ info->netfilter.priority = nf_link->hook_ops.priority;
+ info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0;
+
+ return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+ .release = bpf_nf_link_release,
+ .dealloc = bpf_nf_link_dealloc,
+ .detach = bpf_nf_link_detach,
+ .show_fdinfo = bpf_nf_link_show_info,
+ .fill_link_info = bpf_nf_link_fill_link_info,
+ .update_prog = bpf_nf_link_update,
+};
+
+static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
+{
+ int prio;
+
+ switch (attr->link_create.netfilter.pf) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS)
+ return -EPROTO;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG)
+ return -EOPNOTSUPP;
+
+ /* make sure conntrack confirm is always last */
+ prio = attr->link_create.netfilter.priority;
+ if (prio == NF_IP_PRI_FIRST)
+ return -ERANGE; /* sabotage_in and other warts */
+ else if (prio == NF_IP_PRI_LAST)
+ return -ERANGE; /* e.g. conntrack confirm */
+ else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
+ prio <= NF_IP_PRI_CONNTRACK_DEFRAG)
+ return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */
+
+ return 0;
+}
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct bpf_nf_link *link;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ err = bpf_nf_check_pf_and_hooks(attr);
+ if (err)
+ return err;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog,
+ attr->link_create.attach_type);
+
+ link->hook_ops.hook = nf_hook_run_bpf;
+ link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+ link->hook_ops.priv = prog;
+
+ link->hook_ops.pf = attr->link_create.netfilter.pf;
+ link->hook_ops.priority = attr->link_create.netfilter.priority;
+ link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+ link->net = net;
+ link->dead = false;
+ link->defrag_hook = NULL;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) {
+ err = bpf_nf_enable_defrag(link);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
+ err = nf_register_net_hook(net, &link->hook_ops);
+ if (err) {
+ bpf_nf_disable_defrag(link);
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ get_net_track(net, &link->ns_tracker, GFP_KERNEL);
+
+ return bpf_link_settle(&link_primer);
+}
+
+const struct bpf_prog_ops netfilter_prog_ops = {
+ .test_run = bpf_prog_test_run_nf,
+};
+
+static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
+{
+ struct btf *btf;
+ s32 type_id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR_OR_NULL(btf))
+ return false;
+
+ type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+ if (WARN_ON_ONCE(type_id < 0))
+ return false;
+
+ info->btf = btf;
+ info->btf_id = type_id;
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ return true;
+}
+
+static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_nf_ctx))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_nf_ctx, skb):
+ if (size != sizeof_field(struct bpf_nf_ctx, skb))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "sk_buff");
+ case bpf_ctx_range(struct bpf_nf_ctx, state):
+ if (size != sizeof_field(struct bpf_nf_ctx, state))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "nf_hook_state");
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+static const struct bpf_func_proto *
+bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+const struct bpf_verifier_ops netfilter_verifier_ops = {
+ .is_valid_access = nf_is_valid_access,
+ .get_func_proto = bpf_nf_func_proto,
+};
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 82f36beb2e76..f1be4dd5cf85 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -122,15 +122,68 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
return ERR_PTR(-EAGAIN);
}
+static bool get_ct_or_tuple_from_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conn **ct,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone **zone,
+ bool *refcounted)
+{
+ const struct nf_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *found_ct;
+
+ found_ct = nf_ct_get(skb, &ctinfo);
+ if (found_ct && !nf_ct_is_template(found_ct)) {
+ *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ *zone = nf_ct_zone(found_ct);
+ *ct = found_ct;
+ return true;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple))
+ return false;
+
+ if (found_ct)
+ *zone = nf_ct_zone(found_ct);
+
+ h = nf_conntrack_find_get(net, *zone, tuple);
+ if (!h)
+ return true;
+
+ found_ct = nf_ct_tuplehash_to_ctrack(h);
+ *refcounted = true;
+ *ct = found_ct;
+
+ return true;
+}
+
static int __nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct = NULL;
struct nf_conn *found_ct;
unsigned int collect = 0;
+ bool refcounted = false;
+
+ if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted))
+ return -ENOENT;
+
+ if (ct && nf_ct_is_confirmed(ct)) {
+ if (refcounted)
+ nf_ct_put(ct);
+ return -EEXIST;
+ }
+
+ if ((u32)jiffies == list->last_gc)
+ goto add_new_node;
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
@@ -141,10 +194,10 @@ static int __nf_conncount_add(struct net *net,
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
- return 0; /* already exists */
+ goto out_put; /* already exists */
} else {
collect++;
}
@@ -153,7 +206,7 @@ static int __nf_conncount_add(struct net *net,
found_ct = nf_ct_tuplehash_to_ctrack(found);
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
* We should not see tuples twice unless someone hooks
@@ -162,7 +215,7 @@ static int __nf_conncount_add(struct net *net,
* Attempt to avoid a re-add in this case.
*/
nf_ct_put(found_ct);
- return 0;
+ goto out_put;
} else if (already_closed(found_ct)) {
/*
* we do not care about connections which are
@@ -177,6 +230,7 @@ static int __nf_conncount_add(struct net *net,
nf_ct_put(found_ct);
}
+add_new_node:
if (WARN_ON_ONCE(list->count > INT_MAX))
return -EOVERFLOW;
@@ -184,42 +238,48 @@ static int __nf_conncount_add(struct net *net,
if (conn == NULL)
return -ENOMEM;
- conn->tuple = *tuple;
+ conn->tuple = tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
+ list->last_gc = (u32)jiffies;
+
+out_put:
+ if (refcounted)
+ nf_ct_put(ct);
return 0;
}
-int nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+int nf_conncount_add_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
int ret;
/* check the saved connections */
spin_lock_bh(&list->list_lock);
- ret = __nf_conncount_add(net, list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, list);
spin_unlock_bh(&list->list_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_conncount_add);
+EXPORT_SYMBOL_GPL(nf_conncount_add_skb);
void nf_conncount_list_init(struct nf_conncount_list *list)
{
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
+ list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
/* Return true if the list is empty. Must be called with BH disabled. */
-bool nf_conncount_gc_list(struct net *net,
- struct nf_conncount_list *list)
+static bool __nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
@@ -227,8 +287,8 @@ bool nf_conncount_gc_list(struct net *net,
unsigned int collected = 0;
bool ret = false;
- /* don't bother if other cpu is already doing GC */
- if (!spin_trylock(&list->list_lock))
+ /* don't bother if we just did GC */
+ if ((u32)jiffies == READ_ONCE(list->last_gc))
return false;
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
@@ -258,7 +318,22 @@ bool nf_conncount_gc_list(struct net *net,
if (!list->count)
ret = true;
- spin_unlock(&list->list_lock);
+ list->last_gc = (u32)jiffies;
+
+ return ret;
+}
+
+bool nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
+{
+ bool ret;
+
+ /* don't bother if other cpu is already doing GC */
+ if (!spin_trylock_bh(&list->list_lock))
+ return false;
+
+ ret = __nf_conncount_gc_list(net, list);
+ spin_unlock_bh(&list->list_lock);
return ret;
}
@@ -298,20 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
static unsigned int
insert_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
struct rb_root *root,
unsigned int hash,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+ bool do_gc = true, refcounted = false;
+ unsigned int count = 0, gc_count = 0;
struct rb_node **rbnode, *parent;
- struct nf_conncount_rb *rbconn;
+ struct nf_conntrack_tuple tuple;
struct nf_conncount_tuple *conn;
- unsigned int count = 0, gc_count = 0;
- u8 keylen = data->keylen;
- bool do_gc = true;
+ struct nf_conncount_rb *rbconn;
+ struct nf_conn *ct = NULL;
spin_lock_bh(&nf_conncount_locks[hash]);
restart:
@@ -322,7 +399,7 @@ restart:
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
parent = *rbnode;
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
rbnode = &((*rbnode)->rb_left);
} else if (diff > 0) {
@@ -330,8 +407,8 @@ restart:
} else {
int ret;
- ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
- if (ret)
+ ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list);
+ if (ret && ret != -EEXIST)
count = 0; /* hotdrop */
else
count = rbconn->list.count;
@@ -354,28 +431,35 @@ restart:
goto restart;
}
- /* expected case: match, insert new node */
- rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
- if (rbconn == NULL)
- goto out_unlock;
+ if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) {
+ /* expected case: match, insert new node */
+ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+ if (rbconn == NULL)
+ goto out_unlock;
- conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
- if (conn == NULL) {
- kmem_cache_free(conncount_rb_cachep, rbconn);
- goto out_unlock;
- }
+ conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL) {
+ kmem_cache_free(conncount_rb_cachep, rbconn);
+ goto out_unlock;
+ }
- conn->tuple = *tuple;
- conn->zone = *zone;
- memcpy(rbconn->key, key, sizeof(u32) * keylen);
+ conn->tuple = tuple;
+ conn->zone = *zone;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
+ memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
- nf_conncount_list_init(&rbconn->list);
- list_add(&conn->node, &rbconn->list.head);
- count = 1;
- rbconn->list.count = count;
+ nf_conncount_list_init(&rbconn->list);
+ list_add(&conn->node, &rbconn->list.head);
+ count = 1;
+ rbconn->list.count = count;
- rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_link_node_rcu(&rbconn->node, parent, rbnode);
+ rb_insert_color(&rbconn->node, root);
+
+ if (refcounted)
+ nf_ct_put(ct);
+ }
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash]);
return count;
@@ -383,16 +467,15 @@ out_unlock:
static unsigned int
count_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct rb_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
- u8 keylen = data->keylen;
hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
root = &data->root[hash];
@@ -403,7 +486,7 @@ count_tree(struct net *net,
rbconn = rb_entry(parent, struct nf_conncount_rb, node);
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
parent = rcu_dereference_raw(parent->rb_left);
} else if (diff > 0) {
@@ -411,7 +494,7 @@ count_tree(struct net *net,
} else {
int ret;
- if (!tuple) {
+ if (!skb) {
nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
}
@@ -426,19 +509,23 @@ count_tree(struct net *net,
}
/* same source network -> be counted! */
- ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
spin_unlock_bh(&rbconn->list.list_lock);
- if (ret)
+ if (ret && ret != -EEXIST) {
return 0; /* hotdrop */
- else
+ } else {
+ /* -EEXIST means add was skipped, update the list */
+ if (ret == -EEXIST)
+ nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
+ }
}
}
- if (!tuple)
+ if (!skb)
return 0;
- return insert_tree(net, data, root, hash, key, tuple, zone);
+ return insert_tree(net, skb, l3num, data, root, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
@@ -500,24 +587,24 @@ next:
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- * Call with RCU read lock.
+ * If 'skb' is not null, insert the corresponding tuple into the accounting
+ * data structure. Call with RCU read lock.
*/
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+unsigned int nf_conncount_count_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_data *data,
+ const u32 *key)
{
- return count_tree(net, data, key, tuple, zone);
+ return count_tree(net, skb, l3num, data, key);
+
}
-EXPORT_SYMBOL_GPL(nf_conncount_count);
+EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
-struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
- unsigned int keylen)
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
- int ret, i;
+ int i;
if (keylen % sizeof(u32) ||
keylen / sizeof(u32) > MAX_KEYLEN ||
@@ -530,12 +617,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
if (!data)
return ERR_PTR(-ENOMEM);
- ret = nf_ct_netns_get(net, family);
- if (ret < 0) {
- kfree(data);
- return ERR_PTR(ret);
- }
-
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
data->root[i] = RB_ROOT;
@@ -572,13 +653,11 @@ static void destroy_tree(struct rb_root *r)
}
}
-void nf_conncount_destroy(struct net *net, unsigned int family,
- struct nf_conncount_data *data)
+void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
unsigned int i;
cancel_work_sync(&data->gc_work);
- nf_ct_netns_put(net, family);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
@@ -594,15 +673,11 @@ static int __init nf_conncount_modinit(void)
for (i = 0; i < CONNCOUNT_SLOTS; ++i)
spin_lock_init(&nf_conncount_locks[i]);
- conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
- sizeof(struct nf_conncount_tuple),
- 0, 0, NULL);
+ conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
- conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
- sizeof(struct nf_conncount_rb),
- 0, 0, NULL);
+ conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0);
if (!conncount_rb_cachep) {
kmem_cache_destroy(conncount_conn_cachep);
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 2ccda8ace796..385a5f458aba 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/* Accouting handling for netfilter. */
+/* Accounting handling for netfilter. */
/*
* (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
@@ -22,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-static const struct nf_ct_ext_type acct_extend = {
- .len = sizeof(struct nf_conn_acct),
- .align = __alignof__(struct nf_conn_acct),
- .id = NF_CT_EXT_ACCT,
-};
-
void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
}
-
-int nf_conntrack_acct_init(void)
-{
- int ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
- nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index d011d2eb0848..7be4c35e4795 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- nf_ct_refresh(ct, skb, master_timeout * HZ);
+ nf_ct_refresh(ct, master_timeout * HZ);
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644
index 000000000000..4a136fc3a9c0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/xdp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx (xdp_md, __sk_buff)
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - Passed NULL for bpf_tuple pointer
+ * -EINVAL - opts->reserved is not 0
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
+ * -EINVAL - opts->ct_zone_id set when
+ opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
+ * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ * -ENONET - No network namespace found for netns_id
+ * -ENOENT - Conntrack lookup could not find entry for tuple
+ * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ * or sizeof(tuple->ipv6)
+ * @l4proto - Layer 4 protocol
+ * Values:
+ * IPPROTO_TCP, IPPROTO_UDP
+ * @dir: - connection tracking tuple direction.
+ * @ct_zone_id - connection tracking zone id.
+ * @ct_zone_dir - connection tracking zone direction.
+ * @reserved - Reserved member, will be reused for more options in future
+ * Values:
+ * 0
+ */
+struct bpf_ct_opts {
+ s32 netns_id;
+ s32 error;
+ u8 l4proto;
+ u8 dir;
+ u16 ct_zone_id;
+ u8 ct_zone_dir;
+ u8 reserved[3];
+};
+
+enum {
+ NF_BPF_CT_OPTS_SZ = 16,
+};
+
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum, u8 dir,
+ struct nf_conntrack_tuple *tuple)
+{
+ union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+ union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+ union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+ : &tuple->src.u;
+ union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+ : (void *)&tuple->dst.u;
+
+ if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+ return -EPROTO;
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ switch (tuple_len) {
+ case sizeof(bpf_tuple->ipv4):
+ tuple->src.l3num = AF_INET;
+ src->ip = bpf_tuple->ipv4.saddr;
+ sport->tcp.port = bpf_tuple->ipv4.sport;
+ dst->ip = bpf_tuple->ipv4.daddr;
+ dport->tcp.port = bpf_tuple->ipv4.dport;
+ break;
+ case sizeof(bpf_tuple->ipv6):
+ tuple->src.l3num = AF_INET6;
+ memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ sport->tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ dport->tcp.port = bpf_tuple->ipv6.dport;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+
+ return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+ u32 timeout)
+{
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conntrack_zone ct_zone;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple)
+ return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &otuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_REPLY, &rtuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple,
+ GFP_ATOMIC);
+ if (IS_ERR(ct))
+ goto out;
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ __nf_ct_set_timeout(ct, timeout * HZ);
+
+out:
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts,
+ u32 opts_len)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone ct_zone;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple)
+ return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
+ if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &tuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ hash = nf_conntrack_find_get(net, &ct_zone, &tuple);
+ if (opts->netns_id >= 0)
+ put_net(net);
+ if (!hash)
+ return ERR_PTR(-ENOENT);
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ opts->dir = NF_CT_DIRECTION(hash);
+
+ return ct;
+}
+
+BTF_ID_LIST(btf_nf_conn_ids)
+BTF_ID(struct, nf_conn)
+BTF_ID(struct, nf_conn___init)
+
+/* Check writes into `struct nf_conn` */
+static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *ncit, *nct, *t;
+ size_t end;
+
+ ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != nct && t != ncit) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ /* `struct nf_conn` and `struct nf_conn___init` have the same layout
+ * so we are safe to simply merge offset checks here
+ */
+ switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ case offsetof(struct nf_conn, mark):
+ end = offsetofend(struct nf_conn, mark);
+ break;
+#endif
+ default:
+ bpf_log(log, "no write support to nf_conn at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+__bpf_kfunc_start_defs();
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct nf_conn *nfct;
+
+ nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+ opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = dev_net(ctx->rxq->dev);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct nf_conn *nfct;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct - Pointer to referenced nf_conn___init object, obtained
+ * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+{
+ struct nf_conn *nfct = (struct nf_conn *)nfct_i;
+ int err;
+
+ if (!nf_ct_is_confirmed(nfct))
+ nfct->timeout += nfct_time_stamp;
+ nfct->status |= IPS_CONFIRMED;
+ err = nf_conntrack_hash_check_insert(nfct);
+ if (err < 0) {
+ nf_conntrack_free(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
+{
+ nf_ct_put(nfct);
+}
+
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout - Timeout in msecs.
+ */
+__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+ __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change timeout associated of the inserted or looked up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout - New timeout in msecs.
+ */
+__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+ return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status - New status value.
+ */
+__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+ return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status - New status value.
+ */
+__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+ return nf_ct_change_status_common(nfct, status);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(nf_ct_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ct_kfunc_set,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+ if (!ret) {
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
+ mutex_unlock(&nf_conn_btf_access_lock);
+ }
+
+ return ret;
+}
+
+void cleanup_nf_conntrack_bpf(void)
+{
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = NULL;
+ mutex_unlock(&nf_conn_btf_access_lock);
+}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 1ba6becc3079..a7552a46d6ac 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -20,6 +20,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
unsigned int timeout)
{
+ const struct nf_conntrack_helper *helper;
struct nf_conntrack_expect *exp;
struct iphdr *iph = ip_hdr(skb);
struct rtable *rt = skb_rtable(skb);
@@ -58,7 +59,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
goto out;
exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ exp->tuple.src.u.udp.port = helper->tuple.src.u.udp.port;
exp->mask.src.u3.ip = mask;
exp->mask.src.u.udp.port = htons(0xFFFF);
@@ -71,10 +75,11 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
- nf_ct_refresh(ct, skb, timeout * HZ);
+ nf_ct_refresh(ct, timeout * HZ);
out:
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Broadcast connection tracking helper");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 79cd9dde457b..0b95f226f211 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -21,7 +21,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
@@ -35,10 +34,10 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -66,22 +65,39 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
struct conntrack_gc_work {
struct delayed_work dwork;
- u32 last_bucket;
+ u32 next_bucket;
+ u32 avg_timeout;
+ u32 count;
+ u32 start_time;
bool exiting;
bool early_drop;
- long next_gc_run;
};
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV 128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES (16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO 50u
+/* serialize hash resizes and nf_ct_iterate_cleanup */
+static DEFINE_MUTEX(nf_conntrack_mutex);
+
+#define GC_SCAN_INTERVAL_MAX (60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN (1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
+
+/* Initial bias pretending we have 100 entries at the upper bound so we don't
+ * wakeup often just because we have three entries with a 1s timeout while still
+ * allowing non-idle machines to wakeup more often when needed.
+ */
+#define GC_SCAN_INITIAL_COUNT 100
+#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX
+
+#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
+
+#define MIN_CHAINLEN 50u
+#define MAX_CHAINLEN (80u - MIN_CHAINLEN)
static struct conntrack_gc_work conntrack_gc_work;
@@ -120,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
}
/* return true if we need to recompute hashes (in case hash table was resized) */
-static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
- unsigned int h2, unsigned int sequence)
+static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2,
+ unsigned int sequence)
{
h1 %= CONNTRACK_LOCKS;
h2 %= CONNTRACK_LOCKS;
@@ -149,7 +165,15 @@ static void nf_conntrack_all_lock(void)
spin_lock(&nf_conntrack_locks_all_lock);
- nf_conntrack_locks_all = true;
+ /* For nf_contrack_locks_all, only the latest time when another
+ * CPU will see an update is controlled, by the "release" of the
+ * spin_lock below.
+ * The earliest time is not controlled, an thus KCSAN could detect
+ * a race when nf_conntract_lock() reads the variable.
+ * WRITE_ONCE() is used to ensure the compiler will not
+ * optimize the write.
+ */
+ WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@@ -180,26 +204,25 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
-seqcount_t nf_conntrack_generation __read_mostly;
-static unsigned int nf_conntrack_hash_rnd __read_mostly;
+seqcount_spinlock_t nf_conntrack_generation __read_mostly;
+static siphash_aligned_key_t nf_conntrack_hash_rnd;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
const struct net *net)
{
- unsigned int n;
- u32 seed;
+ siphash_key_t key;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- /* The direction must be ignored, so we hash everything up to the
- * destination ports (which is a multiple of 4) and treat the last
- * three bytes manually.
- */
- seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
- n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, seed ^
- (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ key = nf_conntrack_hash_rnd;
+
+ key.key[0] ^= zoneid;
+ key.key[1] ^= net_hash_mix(net);
+
+ return siphash((void *)tuple,
+ offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
+ &key);
}
static u32 scale_hash(u32 hash)
@@ -209,15 +232,17 @@ static u32 scale_hash(u32 hash)
static u32 __hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
unsigned int size)
{
- return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}
static u32 hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid)
{
- return scale_hash(hash_conntrack_raw(tuple, net));
+ return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}
static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
@@ -297,20 +322,15 @@ nf_ct_get_tuple(const struct sk_buff *skb,
return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
case IPPROTO_TCP:
- case IPPROTO_UDP: /* fallthrough */
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
+ case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
case IPPROTO_UDPLITE:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP:
- return nf_ct_get_tuple_ports(skb, dataoff, tuple);
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP:
+ /* fallthrough */
return nf_ct_get_tuple_ports(skb, dataoff, tuple);
-#endif
default:
break;
}
@@ -463,7 +483,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
*/
u32 nf_ct_get_id(const struct nf_conn *ct)
{
- static __read_mostly siphash_key_t ct_id_seed;
+ static siphash_aligned_key_t ct_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
@@ -482,10 +502,14 @@ u32 nf_ct_get_id(const struct nf_conn *ct)
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);
+static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
+{
+ return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
+}
+
static void
clean_from_lists(struct nf_conn *ct)
{
- pr_debug("clean_from_lists(%p)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
@@ -493,53 +517,9 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_dying_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) dying list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->dying);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) unconfirmed list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->unconfirmed);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* We overload first tuple to link into unconfirmed or dying list.*/
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&pcpu->lock);
-}
-
#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
-/* Released via destroy_conntrack() */
+/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
gfp_t flags)
@@ -553,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
p = tmpl;
tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
- if (tmpl != p) {
- tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ if (tmpl != p)
tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
- }
} else {
tmpl = kzalloc(sizeof(*tmpl), flags);
if (!tmpl)
@@ -566,7 +544,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
nf_ct_zone_add(tmpl, zone);
- atomic_set(&tmpl->ct_general.use, 0);
+ refcount_set(&tmpl->ct_general.use, 1);
return tmpl;
}
@@ -574,7 +552,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
- nf_ct_ext_destroy(tmpl);
+ kfree(tmpl->ext);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -593,13 +571,11 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
#endif
}
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
+void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- pr_debug("destroy_conntrack(%p)\n", ct);
- WARN_ON(atomic_read(&nfct->use) != 0);
+ WARN_ON(refcount_read(&nfct->use) != 0);
if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
@@ -609,7 +585,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
destroy_gre_conntrack(ct);
- local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
@@ -617,64 +592,90 @@ destroy_conntrack(struct nf_conntrack *nfct)
*/
nf_ct_remove_expectations(ct);
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
- local_bh_enable();
-
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
nf_conntrack_free(ct);
}
+EXPORT_SYMBOL(nf_ct_destroy);
-static void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;
- nf_ct_helper_destroy(ct);
-
- local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
+}
- nf_ct_add_to_dying_list(ct);
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+
+ __nf_ct_delete_from_lists(ct);
local_bh_enable();
}
+static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
+
+ spin_lock(&cnet->ecache.dying_lock);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &cnet->ecache.dying_list);
+ spin_unlock(&cnet->ecache.dying_lock);
+#endif
+}
+
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
+ struct net *net;
if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
return false;
tstamp = nf_conn_tstamp_find(ct);
- if (tstamp && tstamp->stop == 0)
+ if (tstamp) {
+ s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
+
tstamp->stop = ktime_get_real_ns();
+ if (timeout < 0)
+ tstamp->stop -= jiffies_to_nsecs(-timeout);
+ }
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
- nf_ct_delete_from_lists(ct);
- nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+ __nf_ct_delete_from_lists(ct);
+ nf_ct_add_to_ecache_list(ct);
+ local_bh_enable();
+
+ nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
- nf_conntrack_ecache_work(nf_ct_net(ct));
+ net = nf_ct_net(ct);
+ if (nf_conntrack_ecache_dwork_pending(net))
+ nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
@@ -713,9 +714,12 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
- if (!atomic_inc_not_zero(&ct->ct_general.use))
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
return;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct))
nf_ct_kill(ct);
@@ -773,17 +777,18 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- rcu_read_lock();
-
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
/* We have a candidate that matches the tuple we're interested
* in, try to obtain a reference and re-check tuple
*/
ct = nf_ct_tuplehash_to_ctrack(h);
- if (likely(atomic_inc_not_zero(&ct->ct_general.use))) {
+ if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
+ /* re-check key after refcount */
+ smp_acquire__after_ctrl_dep();
+
if (likely(nf_ct_key_equal(h, tuple, zone, net)))
- goto found;
+ return h;
/* TYPESAFE_BY_RCU recycled the candidate */
nf_ct_put(ct);
@@ -791,8 +796,6 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
h = NULL;
}
-found:
- rcu_read_unlock();
return h;
}
@@ -801,8 +804,25 @@ struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
- return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, net));
+ unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ struct nf_conntrack_tuple_hash *thash;
+
+ rcu_read_lock();
+
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone_id, net));
+
+ if (thash)
+ goto out_unlock;
+
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (rid != zone_id)
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, rid, net));
+
+out_unlock:
+ rcu_read_unlock();
+ return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -816,6 +836,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
&nf_conntrack_hash[reply_hash]);
}
+static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
+{
+ /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
+ * may contain stale pointers to e.g. helper that has been removed.
+ *
+ * The helper can't clear this because the nf_conn object isn't in
+ * any hash and synchronize_rcu() isn't enough because associated skb
+ * might sit in a queue.
+ */
+ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
+}
+
+static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
+{
+ if (!ext)
+ return true;
+
+ if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
+ return false;
+
+ /* inserted into conntrack table, nf_ct_iterate_cleanup()
+ * will find it. Disable nf_ct_ext_find() id check.
+ */
+ WRITE_ONCE(ext->gen_id, 0);
+ return true;
+}
+
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
@@ -824,44 +871,77 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int max_chainlen;
+ unsigned int chainlen = 0;
unsigned int sequence;
+ int err = -EEXIST;
zone = nf_ct_zone(ct);
+ if (!nf_ct_ext_valid_pre(ct->ext))
+ return -EAGAIN;
+
local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
+
+ max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
+
+ chainlen = 0;
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
+
+ /* If genid has changed, we can't insert anymore because ct
+ * extensions could have stale pointers and nf_ct_iterate_destroy
+ * might have completed its table scan already.
+ *
+ * Increment of the ext genid right after this check is fine:
+ * nf_ct_iterate_destroy blocks until locks are released.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ err = -EAGAIN;
+ goto out;
+ }
smp_wmb();
/* The caller holds a reference to this object */
- atomic_set(&ct->ct_general.use, 2);
+ refcount_set(&ct->ct_general.use, 2);
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
- return 0;
+ return 0;
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ err = -ENOSPC;
out:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return -EEXIST;
+ return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -900,8 +980,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
struct nf_conn_tstamp *tstamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
+ refcount_inc(&ct->ct_general.use);
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
@@ -909,6 +988,57 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}
+/**
+ * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
+ * @ct1: conntrack in hash table to check against
+ * @ct2: merge candidate
+ *
+ * returns true if ct1 and ct2 happen to refer to the same flow, but
+ * in opposing directions, i.e.
+ * ct1: a:b -> c:d
+ * ct2: c:d -> a:b
+ * for both directions. If so, @ct2 should not have been created
+ * as the skb should have been picked up as ESTABLISHED flow.
+ * But ct1 was not yet committed to hash table before skb that created
+ * ct2 had arrived.
+ *
+ * Note we don't compare netns because ct entries in different net
+ * namespace cannot clash to begin with.
+ *
+ * @return: true if ct1 and ct2 are identical when swapping origin/reply.
+ */
+static bool
+nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ u16 id1, id2;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
+ return false;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
+ if (id1 != id2)
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
+
+ return id1 == id2;
+}
+
+static int nf_ct_can_merge(const struct nf_conn *ct,
+ const struct nf_conn *loser_ct)
+{
+ return nf_ct_match(ct, loser_ct) ||
+ nf_ct_match_reverse(ct, loser_ct);
+}
+
+/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
{
@@ -919,26 +1049,19 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
loser_ct = nf_ct_get(skb, &ctinfo);
- if (nf_ct_is_dying(ct))
- return NF_DROP;
-
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- return NF_DROP;
-
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
+ if (nf_ct_can_merge(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);
+ nf_conntrack_get(&ct->ct_general);
+
nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_ct_add_to_dying_list(loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_put(loser_ct);
nf_ct_set(skb, ct, ctinfo);
- NF_CT_STAT_INC(net, insert_failed);
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
- nf_ct_put(ct);
return NF_DROP;
}
@@ -979,7 +1102,7 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
}
/* We want the clashing entry to go away real soon: 1 second timeout. */
- loser_ct->timeout = nfct_time_stamp + HZ;
+ WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
/* IPS_NAT_CLASH removes the entry automatically on the first
* reply. Also prevents UDP tracker from moving the entry to
@@ -998,6 +1121,14 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
&nf_conntrack_hash[repl_idx]);
+ /* confirmed bit must be set after hlist add, not before:
+ * loser_ct can still be visible to other cpu due to
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);
+
+ NF_CT_STAT_INC(net, clash_resolve);
return NF_ACCEPT;
}
@@ -1006,12 +1137,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* @skb: skb that causes the clash
* @h: tuplehash of the clashing entry already in table
- * @hash_reply: hash slot for reply direction
+ * @reply_hash: hash slot for reply direction
*
* A conntrack entry can be inserted to the connection tracking table
* if there is no existing entry with an identical tuple.
*
- * If there is one, @skb (and the assocated, unconfirmed conntrack) has
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
* to be dropped. In case @skb is retransmitted, next conntrack lookup
* will find the already-existing entry.
*
@@ -1027,10 +1158,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
*
* Failing that, the new, unconfirmed conntrack is still added to the table
* provided that the collision only occurs in the ORIGINAL direction.
- * The new entry will be added after the existing one in the hash list,
+ * The new entry will be added only in the non-clashing REPLY direction,
* so packets in the ORIGINAL direction will continue to match the existing
* entry. The new entry will also have a fixed timeout so it expires --
- * due to the collision, it will not see bidirectional traffic.
+ * due to the collision, it will only see reply traffic.
*
* Returns NF_DROP if the clash could not be resolved.
*/
@@ -1062,7 +1193,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
return ret;
drop:
- nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
@@ -1072,6 +1202,7 @@ drop:
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+ unsigned int chainlen = 0, sequence, max_chainlen;
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
@@ -1080,7 +1211,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- unsigned int sequence;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
@@ -1102,9 +1232,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
* connections for unconfirmed conns. But packet copies and
@@ -1123,34 +1253,48 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_DROP;
}
- pr_debug("Confirming conntrack %p\n", ct);
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ goto dying;
+ }
+
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
if (unlikely(nf_ct_is_dying(ct))) {
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
+ max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ chainlen = 0;
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen) {
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ NF_CT_STAT_INC(net, insert_failed);
+ ret = NF_DROP;
+ goto dying;
+ }
+ }
- /* Timer relative to confirmation time, not original
+ /* Timeout is relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
@@ -1158,14 +1302,34 @@ __nf_conntrack_confirm(struct sk_buff *skb)
__nf_conntrack_insert_prepare(ct);
/* Since the lookup is lockless, hash insertion must be done after
- * starting the timer and setting the CONFIRMED bit. The RCU barriers
- * guarantee that no other CPU can find the conntrack before the above
- * stores are visible.
+ * setting ct->timeout. The RCU barriers guarantee that no other CPU
+ * can find the conntrack before the above stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
+
+ /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups
+ * skip entries that lack this bit. This happens when a CPU is looking
+ * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU
+ * or when another CPU encounters this entry right after the insertion
+ * but before the set-confirm-bit below. This bit must not be set until
+ * after __nf_conntrack_hash_insert().
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
+ /* ext area is still valid (rcu read lock is held,
+ * but will go out of scope soon, we need to remove
+ * this conntrack again.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ return NF_DROP;
+ }
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -1183,7 +1347,7 @@ dying:
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
-/* Returns true if a connection correspondings to the tuple (required
+/* Returns true if a connection corresponds to the tuple (required
for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
@@ -1202,7 +1366,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
rcu_read_lock();
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
- hash = __hash_conntrack(net, tuple, hsize);
+ hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1229,7 +1393,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
* Let nf_ct_resolve_clash() deal with this later.
*/
if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
continue;
NF_CT_STAT_INC_ATOMIC(net, found);
@@ -1264,9 +1429,6 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
- continue;
-
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
@@ -1277,9 +1439,12 @@ static unsigned int early_drop_list(struct net *net,
nf_ct_is_dying(tmp))
continue;
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->ct_net and ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
/* kill only if still in same netns -- might have moved due to
* SLAB_TYPESAFE_BY_RCU rules.
*
@@ -1333,68 +1498,78 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct)
static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
+ u8 protonum = nf_ct_protonum(ct);
if (!test_bit(IPS_ASSURED_BIT, &ct->status))
return true;
- l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(protonum);
if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
return true;
return false;
}
-#define DAY (86400 * HZ)
-
-/* Set an arbitrary timeout large enough not to ever expire, this save
- * us a check for the IPS_OFFLOAD_BIT from the packet path via
- * nf_ct_is_expired().
- */
-static void nf_ct_offload_timeout(struct nf_conn *ct)
-{
- if (nf_ct_expires(ct) < DAY / 2)
- ct->timeout = nfct_time_stamp + DAY;
-}
-
static void gc_worker(struct work_struct *work)
{
- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
- unsigned int i, goal, buckets = 0, expired_count = 0;
- unsigned int nf_conntrack_max95 = 0;
+ unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
- unsigned int ratio, scanned = 0;
+ unsigned int expired_count = 0;
unsigned long next_run;
+ s32 delta_time;
+ long count;
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
- goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
- i = gc_work->last_bucket;
+ i = gc_work->next_bucket;
if (gc_work->early_drop)
nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+ if (i == 0) {
+ gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+ gc_work->count = GC_SCAN_INITIAL_COUNT;
+ gc_work->start_time = start_time;
+ }
+
+ next_run = gc_work->avg_timeout;
+ count = gc_work->count;
+
+ end_time = start_time + GC_SCAN_MAX_DURATION;
+
do {
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int hashsz;
struct nf_conn *tmp;
- i++;
rcu_read_lock();
nf_conntrack_get_ht(&ct_hash, &hashsz);
- if (i >= hashsz)
- i = 0;
+ if (i >= hashsz) {
+ rcu_read_unlock();
+ break;
+ }
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ struct nf_conntrack_net *cnet;
struct net *net;
+ long expires;
tmp = nf_ct_tuplehash_to_ctrack(h);
- scanned++;
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- continue;
+ if (expired_count > GC_SCAN_EXPIRED_MAX) {
+ rcu_read_unlock();
+
+ gc_work->next_bucket = i;
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+
+ delta_time = nfct_time_stamp - gc_work->start_time;
+
+ /* re-sched immediately if total cycle time is exceeded */
+ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+ goto early_exit;
}
if (nf_ct_is_expired(tmp)) {
@@ -1403,24 +1578,34 @@ static void gc_worker(struct work_struct *work)
continue;
}
+ expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+ expires = (expires - (long)next_run) / ++count;
+ next_run += expires;
+
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
net = nf_ct_net(tmp);
- if (atomic_read(&net->ct.count) < nf_conntrack_max95)
+ cnet = nf_ct_pernet(net);
+ if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
/* need to take reference to avoid possible races */
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (gc_worker_skip_ct(tmp)) {
nf_ct_put(tmp);
continue;
}
- if (gc_worker_can_early_drop(tmp))
+ if (gc_worker_can_early_drop(tmp)) {
nf_ct_kill(tmp);
+ expired_count++;
+ }
nf_ct_put(tmp);
}
@@ -1431,51 +1616,41 @@ static void gc_worker(struct work_struct *work)
*/
rcu_read_unlock();
cond_resched();
- } while (++buckets < goal);
+ i++;
- if (gc_work->exiting)
- return;
+ delta_time = nfct_time_stamp - end_time;
+ if (delta_time > 0 && i < hashsz) {
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+ gc_work->next_bucket = i;
+ next_run = 0;
+ goto early_exit;
+ }
+ } while (i < hashsz);
- /*
- * Eviction will normally happen from the packet path, and not
- * from this gc worker.
- *
- * This worker is only here to reap expired entries when system went
- * idle after a busy period.
- *
- * The heuristics below are supposed to balance conflicting goals:
- *
- * 1. Minimize time until we notice a stale entry
- * 2. Maximize scan intervals to not waste cycles
- *
- * Normally, expire ratio will be close to 0.
- *
- * As soon as a sizeable fraction of the entries have expired
- * increase scan frequency.
- */
- ratio = scanned ? expired_count * 100 / scanned : 0;
- if (ratio > GC_EVICT_RATIO) {
- gc_work->next_gc_run = min_interval;
- } else {
- unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
+ gc_work->next_bucket = 0;
- BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+ next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
- gc_work->next_gc_run += min_interval;
- if (gc_work->next_gc_run > max)
- gc_work->next_gc_run = max;
- }
+ delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+ if (next_run > (unsigned long)delta_time)
+ next_run -= delta_time;
+ else
+ next_run = 1;
+
+early_exit:
+ if (gc_work->exiting)
+ return;
+
+ if (next_run)
+ gc_work->early_drop = false;
- next_run = gc_work->next_gc_run;
- gc_work->last_bucket = i;
- gc_work->early_drop = false;
queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
- INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
- gc_work->next_gc_run = HZ;
+ INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
gc_work->exiting = false;
}
@@ -1486,18 +1661,23 @@ __nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ unsigned int ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
- atomic_inc(&net->ct.count);
+ ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max &&
- unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+ if (unlikely(ct_count > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
- atomic_dec(&net->ct.count);
- net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ atomic_dec(&cnet->count);
+ if (net == &init_net)
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ else
+ net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n",
+ net->ns.inum);
return ERR_PTR(-ENOMEM);
}
}
@@ -1517,21 +1697,19 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
- ct->timeout = 0;
+ WRITE_ONCE(ct->timeout, 0);
write_pnet(&ct->ct_net, net);
- memset(&ct->__nfct_init_offset, 0,
- offsetof(struct nf_conn, proto) -
- offsetof(struct nf_conn, __nfct_init_offset));
+ memset_after(ct, 0, __nfct_init_offset);
nf_ct_zone_add(ct, zone);
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
- atomic_set(&ct->ct_general.use, 0);
+ refcount_set(&ct->ct_general.use, 0);
return ct;
out:
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
return ERR_PTR(-ENOMEM);
}
@@ -1548,16 +1726,29 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_net *cnet;
/* A freed object has refcnt == 0, that's
* the golden rule for SLAB_TYPESAFE_BY_RCU
*/
- WARN_ON(atomic_read(&ct->ct_general.use) != 0);
+ WARN_ON(refcount_read(&ct->ct_general.use) != 0);
+
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ const struct nf_nat_hook *nat_hook;
+
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (nat_hook)
+ nat_hook->remove_nat_bysrc(ct);
+ rcu_read_unlock();
+ }
- nf_ct_ext_destroy(ct);
+ kfree(ct->ext);
kmem_cache_free(nf_conntrack_cachep, ct);
+ cnet = nf_ct_pernet(net);
+
smp_mb__before_atomic();
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1573,22 +1764,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_ecache *ecache;
+#endif
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
+ struct nf_conntrack_net *cnet;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {
- pr_debug("Can't invert tuple.\n");
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple))
return NULL;
- }
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
- return (struct nf_conntrack_tuple_hash *)ct;
+ return ERR_CAST(ct);
if (!nf_ct_add_synproxy(ct, tmpl)) {
nf_conntrack_free(ct);
@@ -1605,18 +1797,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
- nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
- ecache ? ecache->expmask : 0,
- GFP_ATOMIC);
- local_bh_disable();
- if (net->ct.expect_count) {
- spin_lock(&nf_conntrack_expect_lock);
- exp = nf_ct_find_expectation(net, zone, tuple);
+ if ((ecache || net->ct.sysctl_events) &&
+ !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
+ }
+#endif
+
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
if (exp) {
- pr_debug("expectation arrives ct=%p exp=%p\n",
- ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
@@ -1628,23 +1825,30 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- ct->mark = exp->master->mark;
+ ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
ct->secmark = exp->master->secmark;
#endif
NF_CT_STAT_INC(net, expect_new);
}
- spin_unlock(&nf_conntrack_expect_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
- if (!exp)
+ if (!exp && tmpl)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- /* Now it is inserted into the unconfirmed list, bump refcount */
- nf_conntrack_get(&ct->ct_general);
- nf_ct_add_to_unconfirmed_list(ct);
+ /* Other CPU might have obtained a pointer to this object before it was
+ * released. Because refcount is 0, refcount_inc_not_zero() will fail.
+ *
+ * After refcount_set(1) it will succeed; ensure that zeroing of
+ * ct->status and the correct ct->net pointer are visible; else other
+ * core might observe CONFIRMED bit which means the entry is valid and
+ * in the hash table, but its not (anymore).
+ */
+ smp_wmb();
- local_bh_enable();
+ /* Now it is going to be associated with an sk_buff, set refcount to 1. */
+ refcount_set(&ct->ct_general.use, 1);
if (exp) {
if (exp->expectfn)
@@ -1668,20 +1872,30 @@ resolve_normal_ct(struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+ u32 hash, zone_id, rid;
struct nf_conn *ct;
- u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, state->pf, protonum, state->net,
- &tuple)) {
- pr_debug("Can't get tuple\n");
+ &tuple))
return 0;
- }
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple, state->net);
+
+ zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ hash = hash_conntrack_raw(&tuple, zone_id, state->net);
h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
+
+ if (!h) {
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (zone_id != rid) {
+ u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
+
+ h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
+ }
+ }
+
if (!h) {
h = init_conntrack(state->net, tmpl, &tuple,
skb, dataoff, hash);
@@ -1696,17 +1910,15 @@ resolve_normal_ct(struct nf_conn *tmpl,
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
ctinfo = IP_CT_ESTABLISHED_REPLY;
} else {
+ unsigned long status = READ_ONCE(ct->status);
+
/* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("normal packet for %p\n", ct);
+ if (likely(status & IPS_SEEN_REPLY))
ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("related packet for %p\n", ct);
+ else if (status & IPS_EXPECTED)
ctinfo = IP_CT_RELATED;
- } else {
- pr_debug("new packet for %p\n", ct);
+ else
ctinfo = IP_CT_NEW;
- }
}
nf_ct_set(skb, ct, ctinfo);
return 0;
@@ -1737,10 +1949,8 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl,
else
return NF_ACCEPT;
- if (ret <= 0) {
+ if (ret <= 0)
NF_CT_STAT_INC_ATOMIC(state->net, error);
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- }
return ret;
}
@@ -1787,11 +1997,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
return nf_conntrack_sctp_packet(ct, skb, dataoff,
ctinfo, state);
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP:
- return nf_conntrack_dccp_packet(ct, skb, dataoff,
- ctinfo, state);
-#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
case IPPROTO_GRE:
return nf_conntrack_gre_packet(ct, skb, dataoff,
@@ -1814,18 +2019,14 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
- ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(state->net, ignore);
+ ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
- }
skb->_nfct = 0;
}
/* rcu_read_lock()ed by nf_hook_thresh */
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
- pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
@@ -1864,18 +2065,19 @@ repeat:
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
- pr_debug("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
skb->_nfct = 0;
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
- NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
* closed/aborted connection. We have to go back and create a
* fresh conntrack.
*/
if (ret == -NF_REPEAT)
goto repeat;
+
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ if (ret == NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
+
ret = -ret;
goto out;
}
@@ -1891,35 +2093,11 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);
-/* Alter reply tuple (maybe alter helper). This is for NAT, and is
- implicitly racy: see __nf_conntrack_confirm */
-void nf_conntrack_alter_reply(struct nf_conn *ct,
- const struct nf_conntrack_tuple *newreply)
-{
- struct nf_conn_help *help = nfct_help(ct);
-
- /* Should be unconfirmed, so not in hash table yet */
- WARN_ON(nf_ct_is_confirmed(ct));
-
- pr_debug("Altering reply tuple of %p to ", ct);
- nf_ct_dump_tuple(newreply);
-
- ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (ct->master || (help && !hlist_empty(&help->expectations)))
- return;
-
- rcu_read_lock();
- __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
-
/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
u32 extra_jiffies,
- bool do_acct)
+ unsigned int bytes)
{
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
@@ -1932,8 +2110,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (READ_ONCE(ct->timeout) != extra_jiffies)
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
- if (do_acct)
- nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
+ if (bytes)
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -2025,74 +2203,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
-static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
- struct nf_nat_hook *nat_hook;
- unsigned int status;
- int dataoff;
- u16 l3num;
- u8 l4num;
-
- l3num = nf_ct_l3num(ct);
-
- dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
- if (dataoff <= 0)
- return -1;
-
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple))
- return -1;
-
- if (ct->status & IPS_SRC_NAT) {
- memcpy(tuple.src.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
- sizeof(tuple.src.u3.all));
- tuple.src.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
- }
-
- if (ct->status & IPS_DST_NAT) {
- memcpy(tuple.dst.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
- sizeof(tuple.dst.u3.all));
- tuple.dst.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
- }
-
- h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
- if (!h)
- return 0;
-
- /* Store status bits of the conntrack that is clashing to re-do NAT
- * mangling according to what it has been done already to this packet.
- */
- status = ct->status;
-
- nf_ct_put(ct);
- ct = nf_ct_tuplehash_to_ctrack(h);
- nf_ct_set(skb, ct, ctinfo);
-
- nat_hook = rcu_dereference(nf_nat_hook);
- if (!nat_hook)
- return 0;
-
- if (status & IPS_SRC_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
-
- if (status & IPS_DST_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
-
- return 0;
-}
-
/* This packet is coming from userspace via nf_queue, complete the packet
* processing after the helper invocation in nf_confirm().
*/
@@ -2105,11 +2215,14 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
help = nfct_help(ct);
if (!help)
- return 0;
+ return NF_ACCEPT;
helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
- return 0;
+ return NF_ACCEPT;
switch (nf_ct_l3num(ct)) {
case NFPROTO_IPV4:
@@ -2124,41 +2237,34 @@ static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
&frag_off);
if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
- return 0;
+ return NF_ACCEPT;
break;
}
#endif
default:
- return 0;
+ return NF_ACCEPT;
}
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
!nf_is_loopback_packet(skb)) {
if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return -1;
+ return NF_DROP;
}
}
/* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0;
+ return nf_conntrack_confirm(skb);
}
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
- int err;
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
- return 0;
-
- if (!nf_ct_is_confirmed(ct)) {
- err = __nf_conntrack_update(net, skb, ct, ctinfo);
- if (err < 0)
- return err;
- }
+ return NF_ACCEPT;
return nf_confirm_cthelper(skb, ct, ctinfo);
}
@@ -2201,7 +2307,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
- void *data, unsigned int *bucket)
+ const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -2209,28 +2315,36 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
spinlock_t *lockp;
for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+ struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
+
+ if (hlist_nulls_empty(hslot))
+ continue;
+
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < nf_conntrack_htable_size) {
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
- continue;
- /* All nf_conn objects are added to hash table twice, one
- * for original direction tuple, once for the reply tuple.
- *
- * Exception: In the IPS_NAT_CLASH case, only the reply
- * tuple is added (the original tuple already existed for
- * a different object).
- *
- * We only need to call the iterator once for each
- * conntrack, so we just use the 'reply' direction
- * tuple while iterating.
- */
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
- goto found;
- }
+ hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
+ continue;
+ /* All nf_conn objects are added to hash table twice, one
+ * for original direction tuple, once for the reply tuple.
+ *
+ * Exception: In the IPS_NAT_CLASH case, only the reply
+ * tuple is added (the original tuple already existed for
+ * a different object).
+ *
+ * We only need to call the iterator once for each
+ * conntrack, so we just use the 'reply' direction
+ * tuple while iterating.
+ */
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (iter_data->net &&
+ !net_eq(iter_data->net, nf_ct_net(ct)))
+ continue;
+
+ if (iter(ct, iter_data->data))
+ goto found;
}
spin_unlock(lockp);
local_bh_enable();
@@ -2239,109 +2353,43 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
return NULL;
found:
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
spin_unlock(lockp);
local_bh_enable();
return ct;
}
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
+ const struct nf_ct_iter_data *iter_data)
{
- unsigned int bucket = 0, sequence;
+ unsigned int bucket = 0;
struct nf_conn *ct;
might_sleep();
- for (;;) {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
-
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
- /* Time to push up daises... */
-
- nf_ct_delete(ct, portid, report);
- nf_ct_put(ct);
- cond_resched();
- }
-
- if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
- break;
- bucket = 0;
- }
-}
-
-struct iter_data {
- int (*iter)(struct nf_conn *i, void *data);
- void *data;
- struct net *net;
-};
-
-static int iter_net_only(struct nf_conn *i, void *data)
-{
- struct iter_data *d = data;
-
- if (!net_eq(d->net, nf_ct_net(i)))
- return 0;
-
- return d->iter(i, d->data);
-}
-
-static void
-__nf_ct_unconfirmed_destroy(struct net *net)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
- struct ct_pcpu *pcpu;
-
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
- struct nf_conn *ct;
-
- ct = nf_ct_tuplehash_to_ctrack(h);
+ mutex_lock(&nf_conntrack_mutex);
+ while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
+ /* Time to push up daises... */
- /* we cannot call iter() on unconfirmed list, the
- * owning cpu can reallocate ct->ext at any time.
- */
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&pcpu->lock);
+ nf_ct_delete(ct, iter_data->portid, iter_data->report);
+ nf_ct_put(ct);
cond_resched();
}
+ mutex_unlock(&nf_conntrack_mutex);
}
-void nf_ct_unconfirmed_destroy(struct net *net)
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data)
{
- might_sleep();
-
- if (atomic_read(&net->ct.count) > 0) {
- __nf_ct_unconfirmed_destroy(net);
- nf_queue_nf_hook_drop(net);
- synchronize_net();
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
-
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
-{
- struct iter_data d;
+ struct net *net = iter_data->net;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
might_sleep();
- if (atomic_read(&net->ct.count) == 0)
+ if (atomic_read(&cnet->count) == 0)
return;
- d.iter = iter;
- d.data = data;
- d.net = net;
-
- nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
+ nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
@@ -2359,43 +2407,56 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
down_read(&net_rwsem);
for_each_net(net) {
- if (atomic_read(&net->ct.count) == 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (atomic_read(&cnet->count) == 0)
continue;
- __nf_ct_unconfirmed_destroy(net);
nf_queue_nf_hook_drop(net);
}
up_read(&net_rwsem);
/* Need to wait for netns cleanup worker to finish, if its
* running -- it might have deleted a net namespace from
- * the global list, so our __nf_ct_unconfirmed_destroy() might
- * not have affected all namespaces.
+ * the global list, so hook drop above might not have
+ * affected all namespaces.
*/
net_ns_barrier();
- /* a conntrack could have been unlinked from unconfirmed list
- * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
+ /* a skb w. unconfirmed conntrack could have been reinjected just
+ * before we called nf_queue_nf_hook_drop().
+ *
* This makes sure its inserted into conntrack table.
*/
synchronize_net();
- nf_ct_iterate_cleanup(iter, data, 0, 0);
+ nf_ct_ext_bump_genid();
+ iter_data.data = data;
+ nf_ct_iterate_cleanup(iter, &iter_data);
+
+ /* Another cpu might be in a rcu read section with
+ * rcu protected pointer cleared in iter callback
+ * or hidden via nf_ct_ext_bump_genid() above.
+ *
+ * Wait until those are done.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
static int kill_all(struct nf_conn *i, void *data)
{
- return net_eq(nf_ct_net(i), data);
+ return 1;
}
void nf_conntrack_cleanup_start(void)
{
+ cleanup_nf_conntrack_bpf();
conntrack_gc_work.exiting = true;
- RCU_INIT_POINTER(ip_ct_attach, NULL);
}
void nf_conntrack_cleanup_end(void)
@@ -2405,13 +2466,7 @@ void nf_conntrack_cleanup_end(void)
kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
- nf_conntrack_seqadj_fini();
- nf_conntrack_labels_fini();
nf_conntrack_helper_fini();
- nf_conntrack_timeout_fini();
- nf_conntrack_ecache_fini();
- nf_conntrack_tstamp_fini();
- nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2431,20 +2486,24 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- int busy;
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
+ int busy;
/*
* This makes sure all current packets have passed through
* netfilter framework. Roll on, two-stage module
* delete...
*/
- synchronize_net();
+ synchronize_rcu_expedited();
i_see_dead_people:
busy = 0;
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_iterate_cleanup(kill_all, net, 0, 0);
- if (atomic_read(&net->ct.count) != 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ iter_data.net = net;
+ nf_ct_iterate_cleanup_net(kill_all, &iter_data);
+ if (atomic_read(&cnet->count) != 0)
busy = 1;
}
if (busy) {
@@ -2453,11 +2512,9 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
- free_percpu(net->ct.pcpu_lists);
}
}
@@ -2466,12 +2523,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
- if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+ if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
+ if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head)))
+ return NULL;
+
hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
if (hash && nulls)
@@ -2497,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
if (!hash)
return -ENOMEM;
+ mutex_lock(&nf_conntrack_mutex);
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
+ mutex_unlock(&nf_conntrack_mutex);
kvfree(hash);
return 0;
}
@@ -2515,16 +2577,19 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
for (i = 0; i < nf_conntrack_htable_size; i++) {
while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ unsigned int zone_id;
+
h = hlist_nulls_entry(nf_conntrack_hash[i].first,
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
+
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
bucket = __hash_conntrack(nf_ct_net(ct),
- &h->tuple, hashsize);
+ &h->tuple, zone_id, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = nf_conntrack_htable_size;
old_hash = nf_conntrack_hash;
nf_conntrack_hash = hash;
@@ -2534,6 +2599,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
nf_conntrack_all_unlock();
local_bh_enable();
+ mutex_unlock(&nf_conntrack_mutex);
+
synchronize_net();
kvfree(old_hash);
return 0;
@@ -2558,36 +2625,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-static __always_inline unsigned int total_extension_size(void)
-{
- /* remember to add new extensions below */
- BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
-
- return sizeof(struct nf_ct_ext) +
- sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
- + sizeof(struct nf_conn_nat)
-#endif
- + sizeof(struct nf_conn_seqadj)
- + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
- + sizeof(struct nf_conn_synproxy)
-#endif
- ;
-};
-
int nf_conntrack_init_start(void)
{
unsigned long nr_pages = totalram_pages();
@@ -2595,35 +2632,31 @@ int nf_conntrack_init_start(void)
int ret = -ENOMEM;
int i;
- /* struct nf_ct_ext uses u8 to store offsets/size */
- BUILD_BUG_ON(total_extension_size() > 255u);
-
- seqcount_init(&nf_conntrack_generation);
+ seqcount_spinlock_init(&nf_conntrack_generation,
+ &nf_conntrack_locks_all_lock);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
if (!nf_conntrack_htable_size) {
- /* Idea from tcp.c: use 1/16384 of memory.
- * On i386: 32MB machine has 512 buckets.
- * >= 1GB machines have 16384 buckets.
- * >= 4GB machines have 65536 buckets.
- */
nf_conntrack_htable_size
= (((nr_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
- if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
- nf_conntrack_htable_size = 65536;
+ if (BITS_PER_LONG >= 64 &&
+ nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+ nf_conntrack_htable_size = 262144;
else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
- nf_conntrack_htable_size = 16384;
- if (nf_conntrack_htable_size < 32)
- nf_conntrack_htable_size = 32;
+ nf_conntrack_htable_size = 65536;
- /* Use a max. factor of four by default to get the same max as
- * with the old struct list_heads. When a table size is given
- * we use the old value of 8 to avoid reducing the max.
- * entries. */
- max_factor = 4;
+ if (nf_conntrack_htable_size < 1024)
+ nf_conntrack_htable_size = 1024;
+ /* Use a max. factor of one by default to keep the average
+ * hash chain length at 2 entries. Each entry has to be added
+ * twice (once for original direction, once for reply).
+ * When a table size is given we use the old value of 8 to
+ * avoid implicit reduction of the max entries setting.
+ */
+ max_factor = 1;
}
nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
@@ -2643,34 +2676,10 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init();
- if (ret < 0)
- goto err_acct;
-
- ret = nf_conntrack_tstamp_init();
- if (ret < 0)
- goto err_tstamp;
-
- ret = nf_conntrack_ecache_init();
- if (ret < 0)
- goto err_ecache;
-
- ret = nf_conntrack_timeout_init();
- if (ret < 0)
- goto err_timeout;
-
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
- ret = nf_conntrack_labels_init();
- if (ret < 0)
- goto err_labels;
-
- ret = nf_conntrack_seqadj_init();
- if (ret < 0)
- goto err_seqadj;
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -2678,23 +2687,18 @@ int nf_conntrack_init_start(void)
conntrack_gc_work_init(&conntrack_gc_work);
queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+ ret = register_nf_conntrack_bpf();
+ if (ret < 0)
+ goto err_kfunc;
+
return 0;
+err_kfunc:
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+ nf_conntrack_proto_fini();
err_proto:
- nf_conntrack_seqadj_fini();
-err_seqadj:
- nf_conntrack_labels_fini();
-err_labels:
nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_timeout_fini();
-err_timeout:
- nf_conntrack_ecache_fini();
-err_ecache:
- nf_conntrack_tstamp_fini();
-err_tstamp:
- nf_conntrack_acct_fini();
-err_acct:
nf_conntrack_expect_fini();
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2703,16 +2707,29 @@ err_cachep:
return ret;
}
-static struct nf_ct_hook nf_conntrack_hook = {
+static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
+{
+ struct nf_conn *ct = nf_ct_to_nf_conn(nfct);
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ nf_conntrack_tcp_set_closing(ct);
+ break;
+ }
+}
+
+static const struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
- .destroy = destroy_conntrack,
+ .destroy = nf_ct_destroy,
.get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
+ .set_closing = nf_conntrack_set_closing,
+ .confirm = __nf_conntrack_confirm,
+ .get_id = nf_conntrack_get_id,
};
void nf_conntrack_init_end(void)
{
- /* For use by REJECT target */
- RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
@@ -2720,32 +2737,19 @@ void nf_conntrack_init_end(void)
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
-#define DYING_NULLS_VAL ((1<<30)+1)
int nf_conntrack_init_net(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
int ret = -ENOMEM;
- int cpu;
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
- atomic_set(&net->ct.count, 0);
-
- net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
- if (!net->ct.pcpu_lists)
- goto err_stat;
-
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_init(&pcpu->lock);
- INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- }
+ atomic_set(&cnet->count, 0);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat)
- goto err_pcpu_lists;
+ return ret;
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
@@ -2754,15 +2758,67 @@ int nf_conntrack_init_net(struct net *net)
nf_conntrack_acct_pernet_init(net);
nf_conntrack_tstamp_pernet_init(net);
nf_conntrack_ecache_pernet_init(net);
- nf_conntrack_helper_pernet_init(net);
nf_conntrack_proto_pernet_init(net);
return 0;
err_expect:
free_percpu(net->ct.stat);
-err_pcpu_lists:
- free_percpu(net->ct.pcpu_lists);
-err_stat:
return ret;
}
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return -EPERM;
+
+ __nf_ct_set_timeout(ct, timeout);
+
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+ unsigned int bit;
+
+ /* Ignore these unchangable bits */
+ on &= ~IPS_UNCHANGEABLE_MASK;
+ off &= ~IPS_UNCHANGEABLE_MASK;
+
+ for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+ if (on & (1 << bit))
+ set_bit(bit, &ct->status);
+ else if (off & (1 << bit))
+ clear_bit(bit, &ct->status);
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+ unsigned long d;
+
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ __nf_ct_change_status(ct, status, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 7956c9f19899..81baf2082604 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
-#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -29,8 +28,9 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-#define ECACHE_RETRY_WAIT (HZ/10)
-#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
+#define DYING_NULLS_VAL ((1 << 30) + 1)
+#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
+#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
@@ -38,150 +38,173 @@ enum retry_state {
STATE_DONE,
};
-static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
- struct nf_conn *refs[ECACHE_STACK_ALLOC];
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return &cnet->ecache;
+}
+#if IS_MODULE(CONFIG_NF_CT_NETLINK)
+EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
+#endif
+
+static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
+{
+ unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
+ struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int evicted = 0;
+ unsigned int sent;
- spin_lock(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
- hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+next:
+ sent = 0;
+ spin_lock_bh(&cnet->ecache.dying_lock);
+
+ hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- struct nf_conntrack_ecache *e;
-
- if (!nf_ct_is_confirmed(ct))
- continue;
-
- /* This ecache access is safe because the ct is on the
- * pcpu dying list and we hold the spinlock -- the entry
- * cannot be free'd until after the lock is released.
- *
- * This is true even if ct has a refcount of 0: the
- * cpu that is about to free the entry must remove it
- * from the dying list and needs the lock to do so.
- */
- e = nf_ct_ecache_find(ct);
- if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
- continue;
- /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
- * the worker owns this entry: the ct will remain valid
- * until the worker puts its ct reference.
+ /* The worker owns all entries, ct remains valid until nf_ct_put
+ * in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
- e->state = NFCT_ECACHE_DESTROY_SENT;
- refs[evicted] = ct;
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
- if (++evicted >= ARRAY_SIZE(refs)) {
+ if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
+
+ if (sent++ > 16) {
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+ cond_resched();
+ goto next;
+ }
}
- spin_unlock(&pcpu->lock);
+ spin_unlock_bh(&cnet->ecache.dying_lock);
- /* can't _put while holding lock */
- while (evicted)
- nf_ct_put(refs[--evicted]);
+ hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+ nf_ct_put(ct);
+
+ cond_resched();
+ }
return ret;
}
static void ecache_work(struct work_struct *work)
{
- struct netns_ct *ctnet =
- container_of(work, struct netns_ct, ecache_dwork.work);
- int cpu, delay = -1;
- struct ct_pcpu *pcpu;
+ struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
+ int ret, delay = -1;
+
+ ret = ecache_work_evict_list(cnet);
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_JIFFIES;
+ break;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
+ }
- local_bh_disable();
+ if (delay >= 0)
+ schedule_delayed_work(&cnet->ecache.dwork, delay);
+}
- for_each_possible_cpu(cpu) {
- enum retry_state ret;
+static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
+ const u32 events,
+ const u32 missed,
+ const struct nf_ct_event *item)
+{
+ struct net *net = nf_ct_net(item->ct);
+ struct nf_ct_event_notifier *notify;
+ u32 old, want;
+ int ret;
- pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
+ if (!((events | missed) & e->ctmask))
+ return 0;
- ret = ecache_work_evict_list(pcpu);
+ rcu_read_lock();
- switch (ret) {
- case STATE_CONGESTED:
- delay = ECACHE_RETRY_WAIT;
- goto out;
- case STATE_RESTART:
- delay = 0;
- break;
- case STATE_DONE:
- break;
- }
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (!notify) {
+ rcu_read_unlock();
+ return 0;
}
- out:
- local_bh_enable();
+ ret = notify->ct_event(events | missed, item);
+ rcu_read_unlock();
- ctnet->ecache_dwork_pending = delay > 0;
- if (delay >= 0)
- schedule_delayed_work(&ctnet->ecache_dwork, delay);
+ if (likely(ret >= 0 && missed == 0))
+ return 0;
+
+ do {
+ old = READ_ONCE(e->missed);
+ if (ret < 0)
+ want = old | events;
+ else
+ want = old & ~missed;
+ } while (cmpxchg(&e->missed, old, want) != old);
+
+ return ret;
+}
+
+static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (local64_read(&e->timestamp))
+ local64_set(&e->timestamp, ktime_get_real_ns());
+#endif
}
-int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
- int ret = 0;
- struct net *net = nf_ct_net(ct);
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ unsigned int missed;
+ int ret;
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (!notify)
- goto out_unlock;
+ if (!nf_ct_is_confirmed(ct))
+ return 0;
e = nf_ct_ecache_find(ct);
if (!e)
- goto out_unlock;
+ return 0;
- if (nf_ct_is_confirmed(ct)) {
- struct nf_ct_event item = {
- .ct = ct,
- .portid = e->portid ? e->portid : portid,
- .report = report
- };
- /* This is a resent of a destroy event? If so, skip missed */
- unsigned long missed = e->portid ? 0 : e->missed;
-
- if (!((eventmask | missed) & e->ctmask))
- goto out_unlock;
-
- ret = notify->fcn(eventmask | missed, &item);
- if (unlikely(ret < 0 || missed)) {
- spin_lock_bh(&ct->lock);
- if (ret < 0) {
- /* This is a destroy event that has been
- * triggered by a process, we store the PORTID
- * to include it in the retransmission.
- */
- if (eventmask & (1 << IPCT_DESTROY)) {
- if (e->portid == 0 && portid != 0)
- e->portid = portid;
- e->state = NFCT_ECACHE_DESTROY_FAIL;
- } else {
- e->missed |= eventmask;
- }
- } else {
- e->missed &= ~missed;
- }
- spin_unlock_bh(&ct->lock);
- }
+ memset(&item, 0, sizeof(item));
+
+ item.ct = ct;
+ item.portid = e->portid ? e->portid : portid;
+ item.report = report;
+
+ /* This is a resent of a destroy event? If so, skip missed */
+ missed = e->portid ? 0 : e->missed;
+
+ nf_ct_ecache_tstamp_refresh(e);
+
+ ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
+ if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
+ /* This is a destroy event that has been triggered by a process,
+ * we store the PORTID to include it in the retransmission.
+ */
+ if (e->portid == 0 && portid != 0)
+ e->portid = portid;
}
-out_unlock:
- rcu_read_unlock();
+
return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
@@ -190,53 +213,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
- unsigned long events, missed;
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- int ret;
-
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (notify == NULL)
- goto out_unlock;
+ unsigned int events;
if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
- goto out_unlock;
+ return;
e = nf_ct_ecache_find(ct);
if (e == NULL)
- goto out_unlock;
+ return;
events = xchg(&e->cache, 0);
- /* We make a copy of the missed event cache without taking
- * the lock, thus we may send missed events twice. However,
- * this does not harm and it happens very rarely. */
- missed = e->missed;
-
- if (!((events | missed) & e->ctmask))
- goto out_unlock;
-
item.ct = ct;
item.portid = 0;
item.report = 0;
- ret = notify->fcn(events | missed, &item);
-
- if (likely(ret == 0 && !missed))
- goto out_unlock;
-
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
-
-out_unlock:
- rcu_read_unlock();
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely.
+ */
+ __nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
@@ -246,11 +244,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
{
struct net *net = nf_ct_exp_net(exp);
- struct nf_exp_event_notifier *notify;
+ struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_expect_event_cb);
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
if (!notify)
goto out_unlock;
@@ -264,118 +262,120 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
.portid = portid,
.report = report
};
- notify->fcn(1 << event, &item);
+ notify->exp_event(1 << event, &item);
}
out_unlock:
rcu_read_unlock();
}
-int nf_conntrack_register_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_register_notifier(struct net *net,
+ const struct nf_ct_event_notifier *new)
{
- int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ WARN_ON_ONCE(notify);
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
- ret = 0;
-
-out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
-void nf_conntrack_unregister_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net)
{
- struct nf_ct_event_notifier *notify;
-
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ /* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
-int nf_ct_expect_register_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
- int ret;
- struct nf_exp_event_notifier *notify;
-
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (state == NFCT_ECACHE_DESTROY_FAIL &&
+ !delayed_work_pending(&cnet->ecache.dwork)) {
+ schedule_delayed_work(&cnet->ecache.dwork, HZ);
+ net->ct.ecache_dwork_pending = true;
+ } else if (state == NFCT_ECACHE_DESTROY_SENT) {
+ if (!hlist_nulls_empty(&cnet->ecache.dying_list))
+ mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0);
+ else
+ net->ct.ecache_dwork_pending = false;
}
- rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
- ret = 0;
-
-out_unlock:
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
-void nf_ct_expect_unregister_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
{
- struct nf_exp_event_notifier *notify;
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ u64 ts = 0;
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
- RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
- mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ ts = ktime_get_real_ns();
+
+ local64_set(&e->timestamp, ts);
+#endif
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
-#define NF_CT_EVENTS_DEFAULT 1
-static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *e;
-static const struct nf_ct_ext_type event_extend = {
- .len = sizeof(struct nf_conntrack_ecache),
- .align = __alignof__(struct nf_conntrack_ecache),
- .id = NF_CT_EXT_ECACHE,
-};
+ switch (net->ct.sysctl_events) {
+ case 0:
+ /* assignment via template / ruleset? ignore sysctl. */
+ if (ctmask || expmask)
+ break;
+ return true;
+ case 2: /* autodetect: no event listener, don't allocate extension. */
+ if (!READ_ONCE(nf_ctnetlink_has_listener))
+ return true;
+ fallthrough;
+ case 1:
+ /* always allocate an extension. */
+ if (!ctmask && !expmask) {
+ ctmask = ~0;
+ expmask = ~0;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return true;
+ }
-void nf_conntrack_ecache_pernet_init(struct net *net)
-{
- net->ct.sysctl_events = nf_ct_events;
- INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
-}
+ e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
+ if (e) {
+ nf_ct_ecache_tstamp_new(ct, e);
+ e->ctmask = ctmask;
+ e->expmask = expmask;
+ }
-void nf_conntrack_ecache_pernet_fini(struct net *net)
-{
- cancel_delayed_work_sync(&net->ct.ecache_dwork);
+ return e != NULL;
}
+EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
-int nf_conntrack_ecache_init(void)
+#define NF_CT_EVENTS_DEFAULT 2
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+
+void nf_conntrack_ecache_pernet_init(struct net *net)
{
- int ret = nf_ct_extend_register(&event_extend);
- if (ret < 0)
- pr_err("Unable to register event extension\n");
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
+ net->ct.sysctl_events = nf_ct_events;
- return ret;
+ INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
+ INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
+ spin_lock_init(&cnet->ecache.dying_lock);
+
+ BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
-void nf_conntrack_ecache_fini(void)
+void nf_conntrack_ecache_pernet_fini(struct net *net)
{
- nf_ct_extend_unregister(&event_extend);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ cancel_delayed_work_sync(&cnet->ecache.dwork);
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 42557d2b6a90..cfc2daa3fc7f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -17,7 +17,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
@@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
-static unsigned int nf_ct_expect_hashrnd __read_mostly;
+static siphash_aligned_key_t nf_ct_expect_hashrnd;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -49,12 +49,15 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
+ struct nf_conntrack_net *cnet;
WARN_ON(!master_help);
WARN_ON(timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
- net->ct.expect_count--;
+
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count--;
hlist_del_rcu(&exp->lnode);
master_help->expecting[exp->class]--;
@@ -68,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
static void nf_ct_expectation_timed_out(struct timer_list *t)
{
- struct nf_conntrack_expect *exp = from_timer(exp, t, timeout);
+ struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout);
spin_lock_bh(&nf_conntrack_expect_lock);
nf_ct_unlink_expect(exp);
@@ -78,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t)
static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash, seed;
+ struct {
+ union nf_inet_addr dst_addr;
+ u32 net_mix;
+ u16 dport;
+ u8 l3num;
+ u8 protonum;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
+ u32 hash;
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
- seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
+ memset(&combined, 0, sizeof(combined));
+
+ combined.dst_addr = tuple->dst.u3;
+ combined.net_mix = net_hash_mix(n);
+ combined.dport = (__force __u16)tuple->dst.u.all;
+ combined.l3num = tuple->src.l3num;
+ combined.protonum = tuple->dst.protonum;
- hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
- (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ seed);
+ hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
@@ -104,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
{
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
return true;
@@ -118,10 +132,11 @@ __nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -156,12 +171,13 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net,
const struct nf_conntrack_zone *zone,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple, bool unlink)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i, *exp = NULL;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -187,22 +203,22 @@ nf_ct_find_expectation(struct net *net,
* about to invoke ->destroy(), or nf_ct_delete() via timeout
* or early_drop().
*
- * The atomic_inc_not_zero() check tells: If that fails, we
+ * The refcount_inc_not_zero() check tells: If that fails, we
* know that the ct is being destroyed. If it succeeds, we
* can be sure the ct cannot disappear underneath.
*/
if (unlikely(nf_ct_is_dying(exp->master) ||
- !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ !refcount_inc_not_zero(&exp->master->ct_general.use)))
return NULL;
- if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) {
refcount_inc(&exp->use);
return exp;
- } else if (del_timer(&exp->timeout)) {
+ } else if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
return exp;
}
- /* Undo exp->master refcnt increase, if del_timer() failed */
+ /* Undo exp->master refcnt increase, if timer_delete() failed */
nf_ct_put(exp->master);
return NULL;
@@ -368,6 +384,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put);
static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
+ struct nf_conntrack_net *cnet;
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
@@ -389,7 +406,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
- net->ct.expect_count++;
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count++;
NF_CT_STAT_INC(net, expect_create);
}
@@ -415,6 +433,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
{
const struct nf_conntrack_expect_policy *p;
struct nf_conntrack_expect *i;
+ struct nf_conntrack_net *cnet;
struct nf_conn *master = expect->master;
struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_helper *helper;
@@ -458,7 +477,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
}
}
- if (net->ct.expect_count >= nf_ct_expect_max) {
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count >= nf_ct_expect_max) {
net_warn_ratelimited("nf_conntrack: expectation table full\n");
ret = -EMFILE;
}
@@ -500,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo
hlist_for_each_entry_safe(exp, next,
&nf_ct_expect_hash[i],
hnode) {
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
@@ -530,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net,
if (!net_eq(nf_ct_exp_net(exp), net))
continue;
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, portid, report);
nf_ct_expect_put(exp);
}
@@ -686,7 +706,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- net->ct.expect_count = 0;
return exp_proc_init(net);
}
@@ -703,9 +722,7 @@ int nf_conntrack_expect_init(void)
nf_ct_expect_hsize = 1;
}
nf_ct_expect_max = nf_ct_expect_hsize * 4;
- nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
- sizeof(struct nf_conntrack_expect),
- 0, 0, NULL);
+ nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0);
if (!nf_ct_expect_cachep)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 3dbe2329c3f1..dd62cc12e775 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -13,40 +13,92 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
-void nf_ct_ext_destroy(struct nf_conn *ct)
+atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
+
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
+ [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+ [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+ [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+ [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
{
- unsigned int i;
- struct nf_ct_ext_type *t;
-
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
-
- /* Here the nf_ct_ext_type might have been unregisterd.
- * I.e., it has responsible to cleanup private
- * area in all conntracks when it is unregisterd.
- */
- if (t && t->destroy)
- t->destroy(ct);
- rcu_read_unlock();
- }
-
- kfree(ct->ext);
+ /* remember to add new extensions below */
+ BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+ return sizeof(struct nf_ct_ext) +
+ sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ + sizeof(struct nf_conn_nat)
+#endif
+ + sizeof(struct nf_conn_seqadj)
+ + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ + sizeof(struct nf_conn_act_ct_ext)
+#endif
+ ;
}
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext_type *t;
struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
+ /* struct nf_ct_ext uses u8 to store offsets/size */
+ BUILD_BUG_ON(total_extension_size() > 255u);
if (ct->ext) {
const struct nf_ct_ext *old = ct->ext;
@@ -58,24 +110,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
oldlen = sizeof(*new);
}
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[id]);
- if (!t) {
- rcu_read_unlock();
- return NULL;
- }
-
- newoff = ALIGN(oldlen, t->align);
- newlen = newoff + t->len;
- rcu_read_unlock();
+ newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+ newlen = newoff + nf_ct_ext_type_len[id];
alloc = max(newlen, NF_CT_EXT_PREALLOC);
new = krealloc(ct->ext, alloc, gfp);
if (!new)
return NULL;
- if (!ct->ext)
+ if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
+ new->gen_id = atomic_read(&nf_conntrack_ext_genid);
+ }
new->offset[id] = newoff;
new->len = newlen;
@@ -86,30 +132,28 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
}
EXPORT_SYMBOL(nf_ct_ext_add);
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
+/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
- int ret = 0;
+ unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
+ unsigned int this_id = READ_ONCE(ext->gen_id);
- mutex_lock(&nf_ct_ext_type_mutex);
- if (nf_ct_ext_types[type->id]) {
- ret = -EBUSY;
- goto out;
- }
+ if (!__nf_ct_ext_exist(ext, id))
+ return NULL;
- rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
- mutex_unlock(&nf_ct_ext_type_mutex);
- return ret;
+ if (this_id == 0 || ext->gen_id == gen_id)
+ return (void *)ext + ext->offset[id];
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+EXPORT_SYMBOL(__nf_ct_ext_find);
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
+void nf_ct_ext_bump_genid(void)
{
- mutex_lock(&nf_ct_ext_type_mutex);
- RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
- mutex_unlock(&nf_ct_ext_type_mutex);
- synchronize_rcu();
+ unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
+
+ if (value == UINT_MAX)
+ atomic_set(&nf_conntrack_ext_genid, 1);
+
+ msleep(HZ);
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 9eca90414bb7..617f744a2e3a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -33,10 +33,6 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
MODULE_ALIAS("ip_conntrack_ftp");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-
-/* This is slow, but it's simple. --RR */
-static char *ftp_buffer;
-
static DEFINE_SPINLOCK(nf_ftp_lock);
#define MAX_PORTS 8
@@ -382,7 +378,7 @@ static int help(struct sk_buff *skb,
int ret;
u32 seq;
int dir = CTINFO2DIR(ctinfo);
- unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+ unsigned int matchlen, matchoff;
struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
union nf_inet_addr *daddr;
@@ -398,6 +394,9 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
}
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
return NF_ACCEPT;
@@ -411,9 +410,9 @@ static int help(struct sk_buff *skb,
}
datalen = skb->len - dataoff;
+ /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */
spin_lock_bh(&nf_ftp_lock);
- fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
- BUG_ON(fb_ptr == NULL);
+ fb_ptr = skb->data + dataoff;
ends_in_nl = (fb_ptr[datalen - 1] == '\n');
seq = ntohl(th->seq) + datalen;
@@ -568,7 +567,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
- kfree(ftp_buffer);
}
static int __init nf_conntrack_ftp_init(void)
@@ -577,10 +575,6 @@ static int __init nf_conntrack_ftp_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master));
- ftp_buffer = kmalloc(65536, GFP_KERNEL);
- if (!ftp_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
@@ -600,7 +594,6 @@ static int __init nf_conntrack_ftp_init(void)
ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(ftp_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 573cb4481481..540d97715bd2 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -257,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b)
case 4:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 3:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 2:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 1:
v |= *bs->cur++;
break;
@@ -533,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
/* Get fields bitmap */
if (nf_h323_error_boundary(bs, 0, f->sz))
return H323_ERROR_BOUND;
+ if (f->sz > 32)
+ return H323_ERROR_RANGE;
bmp = get_bitmap(bs, f->sz);
if (base)
*(unsigned int *)base = bmp;
@@ -589,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
bmp2_len = get_bits(bs, 7) + 1;
if (nf_h323_error_boundary(bs, 0, bmp2_len))
return H323_ERROR_BOUND;
+ if (bmp2_len > 32)
+ return H323_ERROR_RANGE;
bmp2 = get_bitmap(bs, bmp2_len);
bmp |= bmp2 >> f->sz;
if (base)
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 8ba037b76ad3..14f73872f647 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -34,6 +34,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_h323.h>
+#define H323_MAX_SIZE 65535
+
/* Parameters */
static unsigned int default_rrq_ttl __read_mostly = 300;
module_param(default_rrq_ttl, uint, 0600);
@@ -49,64 +51,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
"if both endpoints are on different sides "
"(determined by routing information)");
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_sig_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*set_ras_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- __be16 port, __be16 rtp_port,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp) __read_mostly;
-int (*nat_t120_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_h245_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_callforwarding_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_q931_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, TransportAddress *taddr, int idx,
- __be16 port, struct nf_conntrack_expect *exp)
- __read_mostly;
+const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfct_h323_nat_hook);
static DEFINE_SPINLOCK(nf_h323_lock);
static char *h323_buffer;
@@ -142,11 +88,15 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen <= 0) /* No TCP data */
goto clear_out;
+ if (tcpdatalen > H323_MAX_SIZE)
+ tcpdatalen = H323_MAX_SIZE;
+
if (*data == NULL) { /* first TPKT */
/* Get first TPKT pointer */
tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
h323_buffer);
- BUG_ON(tpkt == NULL);
+ if (!tpkt)
+ goto clear_out;
/* Validate TPKT identifier */
if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
@@ -193,7 +143,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen == 4) { /* Separate TPKT header */
/* Netmeeting sends TPKT header and data separately */
pr_debug("nf_ct_h323: separate TPKT header indicates "
- "there will be TPKT data of %hu bytes\n",
+ "there will be TPKT data of %d bytes\n",
tpktlen - 4);
info->tpkt_len[dir] = tpktlen - 4;
return 0;
@@ -258,6 +208,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
@@ -265,7 +216,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
union nf_inet_addr addr;
struct nf_conntrack_expect *rtp_exp;
struct nf_conntrack_expect *rtcp_exp;
- typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
/* Read RTP or RTCP address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -295,15 +245,16 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_UDP, NULL, &rtcp_port);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
- taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(rtp_exp, 0) == 0) {
if (nf_ct_expect_related(rtcp_exp, 0) == 0) {
@@ -332,12 +283,12 @@ static int expect_t120(struct sk_buff *skb,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_t120_hook) nat_t120;
/* Read T.120 address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -354,15 +305,16 @@ static int expect_t120(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_h323: expect T.120 ");
@@ -663,18 +615,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
return 1;
}
+EXPORT_SYMBOL_GPL(get_h225_addr);
static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_h245_hook) nat_h245;
/* Read h245Address */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
@@ -691,15 +644,16 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
IPPROTO_TCP, NULL, &port);
exp->helper = &nf_conntrack_helper_h245;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect H.245 ");
@@ -784,13 +738,13 @@ static int expect_callforwarding(struct sk_buff *skb,
unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
struct net *net = nf_ct_net(ct);
- typeof(nat_callforwarding_hook) nat_callforwarding;
/* Read alternativeAddress */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
@@ -814,16 +768,17 @@ static int expect_callforwarding(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->helper = nf_conntrack_helper_q931;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
- ret = nat_callforwarding(skb, ct, ctinfo,
- protoff, data, dataoff,
- taddr, port, exp);
+ ret = nathook->nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ taddr, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect Call Forwarding ");
@@ -843,12 +798,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
Setup_UUIE *setup)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
int i;
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_q931: Setup\n");
@@ -859,9 +814,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
return -1;
}
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
@@ -869,16 +824,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.src.u3,
- ct->tuplehash[!dir].tuple.src.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ ct->tuplehash[!dir].tuple.src.u.tcp.port);
if (ret < 0)
return -1;
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
@@ -886,10 +841,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->sourceCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->sourceCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.dst.u.tcp.port);
if (ret < 0)
return -1;
}
@@ -1219,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NULL;
*datalen = skb->len - dataoff;
+ if (*datalen > H323_MAX_SIZE)
+ *datalen = H323_MAX_SIZE;
+
return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
}
@@ -1248,13 +1206,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
TransportAddress *taddr, int count)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
int i;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_q931_hook) nat_q931;
/* Look for the first related address */
for (i = 0; i < count; i++) {
@@ -1278,11 +1236,11 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->helper = nf_conntrack_helper_q931;
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
- nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) { /* Need NAT */
- ret = nat_q931(skb, ct, ctinfo, protoff, data,
- taddr, i, port, exp);
+ ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
} else { /* Conntrack only */
if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
@@ -1304,15 +1262,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, GatekeeperRequest *grq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: GRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) /* NATed */
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &grq->rasAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &grq->rasAddress, 1);
return 0;
}
@@ -1366,8 +1324,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationRequest *rrq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
pr_debug("nf_ct_ras: RRQ\n");
@@ -1377,12 +1335,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
if (ret < 0)
return -1;
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- rrq->rasAddress.item,
- rrq->rasAddress.count);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ rrq->rasAddress.item,
+ rrq->rasAddress.count);
if (ret < 0)
return -1;
}
@@ -1402,19 +1360,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationConfirm *rcf)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: RCF\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- rcf->callSignalAddress.item,
- rcf->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ rcf->callSignalAddress.item,
+ rcf->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1427,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
if (info->timeout > 0) {
pr_debug("nf_ct_ras: set RAS connection timeout to "
"%u seconds\n", info->timeout);
- nf_ct_refresh(ct, skb, info->timeout * HZ);
+ nf_ct_refresh(ct, info->timeout * HZ);
/* Set expect timeout */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -1453,18 +1411,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, UnregistrationRequest *urq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: URQ\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- urq->callSignalAddress.item,
- urq->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ urq->callSignalAddress.item,
+ urq->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1475,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
info->sig_port[!dir] = 0;
/* Give it 30 seconds for UCF or URJ */
- nf_ct_refresh(ct, skb, 30 * HZ);
+ nf_ct_refresh(ct, 30 * HZ);
return 0;
}
@@ -1486,39 +1444,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, AdmissionRequest *arq)
{
const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_ras: ARQ\n");
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (!nathook)
+ return 0;
+
if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->destCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- info->sig_port[!dir]);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
}
if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->srcCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- port);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->srcCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ port);
}
return 0;
@@ -1534,7 +1495,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: ACF\n");
@@ -1543,12 +1503,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
return 0;
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+ const struct nfct_h323_nat_hooks *nathook;
+
/* Answering ACF */
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_sig_addr(skb, ct, ctinfo, protoff, data,
- &acf->destCallSignalAddress, 1);
+ return nathook->set_sig_addr(skb, ct, ctinfo, protoff,
+ data,
+ &acf->destCallSignalAddress, 1);
return 0;
}
@@ -1577,15 +1540,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, LocationRequest *lrq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: LRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &lrq->replyAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &lrq->replyAddress, 1);
return 0;
}
@@ -1633,27 +1596,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, InfoRequestResponse *irr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: IRR\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- &irr->rasAddress, 1);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &irr->rasAddress, 1);
if (ret < 0)
return -1;
- }
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- irr->callSignalAddress.item,
- irr->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ irr->callSignalAddress.item,
+ irr->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1820,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master));
- h323_buffer = kmalloc(65536, GFP_KERNEL);
+ h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL);
if (!h323_buffer)
return -ENOMEM;
ret = h323_helper_init();
@@ -1836,17 +1794,6 @@ err1:
module_init(nf_conntrack_h323_init);
module_exit(nf_conntrack_h323_fini);
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 connection tracking helper");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 118f415928ae..ceb48c3ca0a4 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -26,7 +26,9 @@
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_log.h>
+#include <net/ip.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
struct hlist_head *nf_ct_helper_hash __read_mostly;
@@ -35,11 +37,6 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = false;
-module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
-MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 0)");
-
static DEFINE_MUTEX(nf_ct_nat_helpers_mutex);
static struct list_head nf_ct_nat_helpers __read_mostly;
@@ -51,24 +48,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
(__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
}
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
- unsigned int h;
-
- if (!nf_ct_helper_count)
- return NULL;
-
- h = helper_hash(tuple);
- hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
- if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
- return helper;
- }
- return NULL;
-}
-
struct nf_conntrack_helper *
__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
{
@@ -165,7 +144,7 @@ nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
if (!nat) {
snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name);
rcu_read_unlock();
- request_module(mod_name);
+ request_module("%s", mod_name);
rcu_read_lock();
nat = nf_conntrack_nat_helper_find(mod_name);
@@ -209,59 +188,31 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
-static struct nf_conntrack_helper *
-nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
-{
- if (!net->ct.sysctl_auto_assign_helper) {
- if (net->ct.auto_assign_helper_warned)
- return NULL;
- if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
- return NULL;
- pr_info("nf_conntrack: default automatic helper assignment "
- "has been turned off for security reasons and CT-based "
- " firewall rule not found. Use the iptables CT target "
- "to attach helpers instead.\n");
- net->ct.auto_assign_helper_warned = 1;
- return NULL;
- }
-
- return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-}
-
-
int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
gfp_t flags)
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
- struct net *net = nf_ct_net(ct);
- /* We already got a helper explicitly attached. The function
- * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
- * the helper up again. Since now the user is in full control of
- * making consistent helper configurations, skip this automatic
- * re-lookup, otherwise we'll lose the helper.
- */
+ /* We already got a helper explicitly attached (e.g. nft_ct) */
if (test_bit(IPS_HELPER_BIT, &ct->status))
return 0;
- if (tmpl != NULL) {
- help = nfct_help(tmpl);
- if (help != NULL) {
- helper = help->helper;
- set_bit(IPS_HELPER_BIT, &ct->status);
- }
+ if (WARN_ON_ONCE(!tmpl))
+ return 0;
+
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = rcu_dereference(help->helper);
+ set_bit(IPS_HELPER_BIT, &ct->status);
}
help = nfct_help(ct);
if (helper == NULL) {
- helper = nf_ct_lookup_helper(ct, net);
- if (helper == NULL) {
- if (help)
- RCU_INIT_POINTER(help->helper, NULL);
- return 0;
- }
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ return 0;
}
if (help == NULL) {
@@ -404,6 +355,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+ if (!nf_ct_helper_hash)
+ return -ENOENT;
+
if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
return -EINVAL;
@@ -414,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
(cur->tuple.src.l3num == NFPROTO_UNSPEC ||
cur->tuple.src.l3num == me->tuple.src.l3num) &&
cur->tuple.dst.protonum == me->tuple.dst.protonum) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -425,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple,
&mask)) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -467,11 +421,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
-
- /* Maybe someone has gotten the helper already when unhelp above.
- * So need to wait it.
- */
- synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
@@ -549,42 +498,20 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
}
EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
-static const struct nf_ct_ext_type helper_extend = {
- .len = sizeof(struct nf_conn_help),
- .align = __alignof__(struct nf_conn_help),
- .id = NF_CT_EXT_HELPER,
-};
-
-void nf_conntrack_helper_pernet_init(struct net *net)
-{
- net->ct.auto_assign_helper_warned = false;
- net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
-}
-
int nf_conntrack_helper_init(void)
{
- int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
nf_ct_helper_hash =
nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- ret = nf_ct_extend_register(&helper_extend);
- if (ret < 0) {
- pr_err("nf_ct_helper: Unable to register helper extension.\n");
- goto out_extend;
- }
-
INIT_LIST_HEAD(&nf_ct_nat_helpers);
return 0;
-out_extend:
- kvfree(nf_ct_helper_hash);
- return ret;
}
void nf_conntrack_helper_fini(void)
{
- nf_ct_extend_unregister(&helper_extend);
kvfree(nf_ct_helper_hash);
+ nf_ct_helper_hash = NULL;
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index e40988a2f22f..5703846bea3b 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -39,6 +39,7 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
#define HELPER_NAME "irc"
+#define MAX_SEARCH_SIZE 4095
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
@@ -121,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
int i, ret = NF_ACCEPT;
char *addr_beg_p, *addr_end_p;
typeof(nf_nat_irc_hook) nf_nat_irc;
+ unsigned int datalen;
/* If packet is coming from IRC server */
if (dir == IP_CT_DIR_REPLY)
@@ -140,23 +142,52 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
+ datalen = skb->len - dataoff;
+ if (datalen > MAX_SEARCH_SIZE)
+ datalen = MAX_SEARCH_SIZE;
+
spin_lock_bh(&irc_buffer_lock);
- ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ ib_ptr = skb_header_pointer(skb, dataoff, datalen,
irc_buffer);
- BUG_ON(ib_ptr == NULL);
+ if (!ib_ptr) {
+ spin_unlock_bh(&irc_buffer_lock);
+ return NF_ACCEPT;
+ }
data = ib_ptr;
- data_limit = ib_ptr + skb->len - dataoff;
+ data_limit = ib_ptr + datalen;
+
+ /* Skip any whitespace */
+ while (data < data_limit - 10) {
+ if (*data == ' ' || *data == '\r' || *data == '\n')
+ data++;
+ else
+ break;
+ }
+
+ /* strlen("PRIVMSG x ")=10 */
+ if (data < data_limit - 10) {
+ if (strncasecmp("PRIVMSG ", data, 8))
+ goto out;
+ data += 8;
+ }
- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
- while (data < data_limit - (19 + MINMATCHLEN)) {
- if (memcmp(data, "\1DCC ", 5)) {
+ /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26
+ * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26
+ */
+ while (data < data_limit - (21 + MINMATCHLEN)) {
+ /* Find first " :", the start of message */
+ if (memcmp(data, " :", 2)) {
data++;
continue;
}
+ data += 2;
+
+ /* then check that place only for the DCC command */
+ if (memcmp(data, "\1DCC ", 5))
+ goto out;
data += 5;
- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+ /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */
iph = ip_hdr(skb);
pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
@@ -172,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
pr_debug("DCC %s detected\n", dccprotos[i]);
/* we have at least
- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid
* data left (== 14/13 bytes) */
if (parse_dcc(data, data_limit, &dcc_ip,
&dcc_port, &addr_beg_p, &addr_end_p)) {
@@ -185,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
/* dcc_ip can be the internal OR external (NAT'ed) IP */
tuple = &ct->tuplehash[dir].tuple;
- if (tuple->src.u3.ip != dcc_ip &&
- tuple->dst.u3.ip != dcc_ip) {
+ if ((tuple->src.u3.ip != dcc_ip &&
+ ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) ||
+ dcc_port == 0) {
net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
&tuple->src.u3.ip,
&dcc_ip, dcc_port);
@@ -248,7 +280,7 @@ static int __init nf_conntrack_irc_init(void)
irc_exp_policy.max_expected = max_dcc_channels;
irc_exp_policy.timeout = dcc_timeout;
- irc_buffer = kmalloc(65536, GFP_KERNEL);
+ irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL);
if (!irc_buffer)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index 522792556632..6c46aad23313 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -11,8 +11,6 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
-static DEFINE_SPINLOCK(nf_connlabels_lock);
-
static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -60,12 +58,15 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
int nf_connlabels_get(struct net *net, unsigned int bits)
{
+ int v;
+
if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used++;
- spin_unlock(&nf_connlabels_lock);
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
+ v = atomic_inc_return_relaxed(&net->ct.labels_used);
+ WARN_ON_ONCE(v <= 0);
return 0;
}
@@ -73,26 +74,8 @@ EXPORT_SYMBOL_GPL(nf_connlabels_get);
void nf_connlabels_put(struct net *net)
{
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used--;
- spin_unlock(&nf_connlabels_lock);
-}
-EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
- .len = sizeof(struct nf_conn_labels),
- .align = __alignof__(struct nf_conn_labels),
- .id = NF_CT_EXT_LABELS,
-};
-
-int nf_conntrack_labels_init(void)
-{
- BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+ int v = atomic_dec_return_relaxed(&net->ct.labels_used);
- return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
- nf_ct_extend_unregister(&labels_extend);
+ WARN_ON_ONCE(v < 0);
}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 7f19ee259609..55415f011943 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -20,13 +20,14 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#define HELPER_NAME "netbios-ns"
#define NMBD_PORT 137
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_netbios_ns");
-MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static unsigned int timeout __read_mostly = 3;
module_param(timeout, uint, 0400);
@@ -44,7 +45,7 @@ static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
}
static struct nf_conntrack_helper helper __read_mostly = {
- .name = "netbios-ns",
+ .name = HELPER_NAME,
.tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d7bd8b1f27d5..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -57,6 +57,13 @@
#include "nf_internals.h"
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("List and change connection tracking table");
+
+struct ctnetlink_list_dump_ctx {
+ unsigned long last_id;
+ unsigned int cpu;
+ bool done;
+};
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
@@ -167,9 +174,18 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
+ bool skip_zero)
{
- long timeout = nf_ct_expires(ct) / HZ;
+ long timeout;
+
+ if (nf_ct_is_confirmed(ct))
+ timeout = nf_ct_expires(ct) / HZ;
+ else
+ timeout = ct->timeout / HZ;
+
+ if (skip_zero && timeout == 0)
+ return 0;
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
@@ -179,7 +195,8 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct,
+ bool destroy)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -193,7 +210,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
if (!nest_proto)
goto nla_put_failure;
- ret = l4proto->to_nlattr(skb, nest_proto, ct);
+ ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy);
nla_nest_end(skb, nest_proto);
@@ -213,6 +230,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
if (!help)
return 0;
+ rcu_read_lock();
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
@@ -228,9 +246,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
nla_nest_end(skb, nest_helper);
out:
+ rcu_read_unlock();
return 0;
nla_put_failure:
+ rcu_read_unlock();
return -1;
}
@@ -314,9 +334,15 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct,
+ bool dump)
{
- if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ u32 mark = READ_ONCE(ct->mark);
+
+ if (!mark && !dump)
+ return 0;
+
+ if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
goto nla_put_failure;
return 0;
@@ -324,18 +350,18 @@ nla_put_failure:
return -1;
}
#else
-#define ctnetlink_dump_mark(a, b) (0)
+#define ctnetlink_dump_mark(a, b, c) (0)
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
- int len, ret;
- char *secctx;
+ struct lsm_context ctx;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return 0;
ret = -1;
@@ -343,20 +369,37 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
if (!nest_secctx)
goto nla_put_failure;
- if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context))
goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
nla_put_failure:
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
return ret;
}
#else
#define ctnetlink_dump_secctx(a, b) (0)
#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct);
+
+ if (e) {
+ u64 ts = local64_read(&e->timestamp);
+
+ if (ts)
+ return nla_put_be64(skb, CTA_TIMESTAMP_EVENT,
+ cpu_to_be64(ts), CTA_TIMESTAMP_PAD);
+ }
+#endif
+ return 0;
+}
+
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -365,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
return 0;
return nla_total_size(sizeof(labels->bits));
}
+#endif
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
@@ -385,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-#else
-#define ctnetlink_dump_labels(a, b) (0)
-#define ctnetlink_label_size(a) (0)
-#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -500,7 +540,7 @@ nla_put_failure:
static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
goto nla_put_failure;
return 0;
@@ -529,7 +569,7 @@ static int ctnetlink_dump_extinfo(struct sk_buff *skb,
static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
{
if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct, true) < 0 ||
ctnetlink_dump_secctx(skb, ct) < 0 ||
ctnetlink_dump_id(skb, ct) < 0 ||
ctnetlink_dump_use(skb, ct) < 0 ||
@@ -537,8 +577,8 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
return -1;
if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) &&
- (ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0))
+ (ctnetlink_dump_timeout(skb, ct, false) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0))
return -1;
return 0;
@@ -550,22 +590,17 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
{
const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
unsigned int event;
if (portid)
flags |= NLM_F_MULTI;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -631,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
return len + len4;
}
-#endif
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
@@ -646,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- int len, ret;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, NULL, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, NULL);
+ if (ret < 0)
return 0;
return nla_total_size(0) /* CTA_SECCTX */
- + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */
#else
return 0;
#endif
@@ -669,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
return 0;
#endif
}
+#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
@@ -699,16 +734,18 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */
+#endif
;
}
static int
-ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
{
const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
struct nf_conn *ct = item->ct;
struct sk_buff *skb;
@@ -738,15 +775,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
@@ -780,15 +813,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_timeout(skb, ct, true) < 0)
+ goto nla_put_failure;
+
if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0)
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, true) < 0)
goto nla_put_failure;
} else {
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (events & (1 << IPCT_PROTOINFO)
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (events & (1 << IPCT_PROTOINFO) &&
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
@@ -818,10 +855,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((events & (1 << IPCT_MARK) || ct->mark)
- && ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
goto nla_put_failure;
#endif
+
+ if (ctnetlink_dump_event_timestamp(skb, ct))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
@@ -844,15 +884,18 @@ errout:
static int ctnetlink_done(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
kfree(cb->data);
return 0;
}
+struct ctnetlink_filter_u32 {
+ u32 val;
+ u32 mask;
+};
+
struct ctnetlink_filter {
- u_int32_t cta_flags;
u8 family;
+ bool zone_filter;
u_int32_t orig_flags;
u_int32_t reply_flags;
@@ -861,10 +904,8 @@ struct ctnetlink_filter {
struct nf_conntrack_tuple reply;
struct nf_conntrack_zone zone;
- struct {
- u_int32_t val;
- u_int32_t mask;
- } mark;
+ struct ctnetlink_filter_u32 mark;
+ struct ctnetlink_filter_u32 status;
};
static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
@@ -906,9 +947,45 @@ static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
struct nf_conntrack_zone *zone,
u_int32_t flags);
-/* applied on filters */
-#define CTA_FILTER_F_CTA_MARK (1 << 0)
-#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1)
+static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
+ const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK]) {
+ mark->val = ntohl(nla_get_be32(cda[CTA_MARK]));
+
+ if (cda[CTA_MARK_MASK])
+ mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ else
+ mark->mask = 0xffffffff;
+ } else if (cda[CTA_MARK_MASK]) {
+ return -EINVAL;
+ }
+#endif
+ return 0;
+}
+
+static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
+ const struct nlattr * const cda[])
+{
+ if (cda[CTA_STATUS]) {
+ status->val = ntohl(nla_get_be32(cda[CTA_STATUS]));
+ if (cda[CTA_STATUS_MASK])
+ status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK]));
+ else
+ status->mask = status->val;
+
+ /* status->val == 0? always true, else always false. */
+ if (status->mask == 0)
+ return -EINVAL;
+ } else if (cda[CTA_STATUS_MASK]) {
+ return -EINVAL;
+ }
+
+ /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
+ BUILD_BUG_ON(__IPS_MAX_BIT >= 32);
+ return 0;
+}
static struct ctnetlink_filter *
ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
@@ -927,35 +1004,33 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
filter->family = family;
-#ifdef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK]) {
- filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
- filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK);
+ err = ctnetlink_filter_parse_mark(&filter->mark, cda);
+ if (err)
+ goto err_filter;
- if (cda[CTA_MARK_MASK]) {
- filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
- filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK);
- } else {
- filter->mark.mask = 0xffffffff;
- }
- } else if (cda[CTA_MARK_MASK]) {
- return ERR_PTR(-EINVAL);
+ err = ctnetlink_filter_parse_status(&filter->status, cda);
+ if (err)
+ goto err_filter;
+
+ if (cda[CTA_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
+ if (err < 0)
+ goto err_filter;
+ filter->zone_filter = true;
}
-#endif
+
if (!cda[CTA_FILTER])
return filter;
- err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
- if (err < 0)
- return ERR_PTR(err);
-
err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
if (err < 0)
- return ERR_PTR(err);
+ goto err_filter;
if (filter->orig_flags) {
- if (!cda[CTA_TUPLE_ORIG])
- return ERR_PTR(-EINVAL);
+ if (!cda[CTA_TUPLE_ORIG]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
err = ctnetlink_parse_tuple_filter(cda, &filter->orig,
CTA_TUPLE_ORIG,
@@ -963,28 +1038,35 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
&filter->zone,
filter->orig_flags);
if (err < 0)
- return ERR_PTR(err);
+ goto err_filter;
}
if (filter->reply_flags) {
- if (!cda[CTA_TUPLE_REPLY])
- return ERR_PTR(-EINVAL);
+ if (!cda[CTA_TUPLE_REPLY]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
err = ctnetlink_parse_tuple_filter(cda, &filter->reply,
CTA_TUPLE_REPLY,
filter->family,
&filter->zone,
- filter->orig_flags);
+ filter->reply_flags);
if (err < 0)
- return ERR_PTR(err);
+ goto err_filter;
}
return filter;
+
+err_filter:
+ kfree(filter);
+
+ return ERR_PTR(err);
}
static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
{
- return family || cda[CTA_MARK] || cda[CTA_FILTER];
+ return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE];
}
static int ctnetlink_start(struct netlink_callback *cb)
@@ -1077,6 +1159,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
{
struct ctnetlink_filter *filter = data;
struct nf_conntrack_tuple *tuple;
+ u32 status;
if (filter == NULL)
goto out;
@@ -1088,6 +1171,10 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
if (filter->family && nf_ct_l3num(ct) != filter->family)
goto ignore_entry;
+ if (filter->zone_filter &&
+ !nf_ct_zone_equal_any(ct, &filter->zone))
+ goto ignore_entry;
+
if (filter->orig_flags) {
tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
if (!ctnetlink_filter_match_tuple(&filter->orig, tuple,
@@ -1105,13 +1192,12 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) &&
- (ct->mark & filter->mark.mask) != filter->mark.val)
- goto ignore_entry;
- else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) &&
- ct->mark != filter->mark.val)
+ if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val)
goto ignore_entry;
#endif
+ status = (u32)READ_ONCE(ct->status);
+ if ((status & filter->status.mask) != filter->status.val)
+ goto ignore_entry;
out:
return 1;
@@ -1120,19 +1206,26 @@ ignore_entry:
return 0;
}
+static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+{
+ unsigned long id = nf_ct_get_id(ct);
+
+ return id ? id : 1;
+}
+
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
struct net *net = sock_net(skb->sk);
- struct nf_conn *ct, *last;
+ unsigned long last_id = cb->args[1];
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *nf_ct_evict[8];
+ struct nf_conn *ct;
int res, i;
spinlock_t *lockp;
- last = (struct nf_conn *)cb->args[1];
i = 0;
local_bh_disable();
@@ -1153,12 +1246,11 @@ restart:
}
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
+ /* need to defer nf_ct_kill() until lock is released */
if (i < ARRAY_SIZE(nf_ct_evict) &&
- atomic_inc_not_zero(&ct->ct_general.use))
+ refcount_inc_not_zero(&ct->ct_general.use))
nf_ct_evict[i++] = ct;
continue;
}
@@ -1166,8 +1258,11 @@ restart:
if (!net_eq(net, nf_ct_net(ct)))
continue;
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+
if (cb->args[1]) {
- if (ct != last)
+ if (ctnetlink_get_id(ct) != last_id)
continue;
cb->args[1] = 0;
}
@@ -1180,8 +1275,7 @@ restart:
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, true, flags);
if (res < 0) {
- nf_conntrack_get(&ct->ct_general);
- cb->args[1] = (unsigned long)ct;
+ cb->args[1] = ctnetlink_get_id(ct);
spin_unlock(lockp);
goto out;
}
@@ -1194,12 +1288,10 @@ restart:
}
out:
local_bh_enable();
- if (last) {
+ if (last_id) {
/* nf ct hash resize happened, now clear the leftover. */
- if ((struct nf_conn *)cb->args[1] == last)
+ if (cb->args[1] == last_id)
cb->args[1] = 0;
-
- nf_ct_put(last);
}
while (i) {
@@ -1261,15 +1353,11 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
struct nlattr *tb[CTA_IP_MAX+1];
int ret = 0;
- ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, NULL, NULL);
+ ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr,
+ cta_ip_nla_policy, NULL);
if (ret < 0)
return ret;
- ret = nla_validate_nested_deprecated(attr, CTA_IP_MAX,
- cta_ip_nla_policy, NULL);
- if (ret)
- return ret;
-
switch (tuple->src.l3num) {
case NFPROTO_IPV4:
ret = ipv4_nlattr_to_tuple(tb, tuple, flags);
@@ -1392,7 +1480,8 @@ ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
if (err < 0)
return err;
-
+ if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6)
+ return -EOPNOTSUPP;
tuple->src.l3num = l3num;
if (flags & CTA_FILTER_FLAG(CTA_IP_DST) ||
@@ -1493,13 +1582,12 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
[CTA_LABELS_MASK] = { .type = NLA_BINARY,
.len = NF_CT_LABELS_MAX_SIZE },
[CTA_FILTER] = { .type = NLA_NESTED },
+ [CTA_STATUS_MASK] = { .type = NLA_U32 },
+ [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT },
};
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
{
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- return 0;
-
return ctnetlink_filter_match(ct, data);
}
@@ -1508,68 +1596,64 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
+ struct nf_ct_iter_data iter = {
+ .net = net,
+ .portid = portid,
+ .report = report,
+ };
if (ctnetlink_needs_filter(family, cda)) {
- if (cda[CTA_FILTER])
- return -EOPNOTSUPP;
-
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
+
+ iter.data = filter;
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
+ nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
}
-static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u8 family = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
return err;
- if (cda[CTA_TUPLE_ORIG])
+ if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
- nfmsg->nfgen_family, &zone);
- else if (cda[CTA_TUPLE_REPLY])
+ family, &zone);
+ else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
- nfmsg->nfgen_family, &zone);
+ family, &zone);
else {
- u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC;
+ u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? family : AF_UNSPEC;
- return ctnetlink_flush_conntrack(net, cda,
+ return ctnetlink_flush_conntrack(info->net, cda,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh), u3);
+ nlmsg_report(info->nlh), u3);
}
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
- nf_ct_put(ct);
- return -EBUSY;
- }
-
if (cda[CTA_ID]) {
__be32 id = nla_get_be32(cda[CTA_ID]);
@@ -1579,28 +1663,25 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
}
}
- nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
nf_ct_put(ct);
return 0;
}
-static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct sk_buff *skb2 = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
+ struct nf_conn *ct;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = ctnetlink_start,
.dump = ctnetlink_dump_table,
@@ -1608,7 +1689,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
.data = (void *)cda,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1627,158 +1708,144 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_put(ct);
return -ENOMEM;
}
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0);
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
+ true, 0);
nf_ct_put(ct);
- if (err <= 0)
- goto free;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+}
- return 0;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_dump_one_entry(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nf_conn *ct,
+ bool dying)
+{
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u8 l3proto = nfmsg->nfgen_family;
+ int res;
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ return 0;
+
+ if (ctx->last_id) {
+ if (ctnetlink_get_id(ct) != ctx->last_id)
+ return 0;
+
+ ctx->last_id = 0;
+ }
+
+ /* We can't dump extension info for the unconfirmed
+ * list because unconfirmed conntracks can have
+ * ct->ext reallocated (and thus freed).
+ *
+ * In the dying list case ct->ext can't be free'd
+ * until after we drop pcpu->lock.
+ */
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, dying, 0);
+ if (res < 0)
+ ctx->last_id = ctnetlink_get_id(ct);
+
+ return res;
}
+#endif
-static int ctnetlink_done_list(struct netlink_callback *cb)
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
return 0;
}
static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conn *ct, *last;
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ const struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_net_ecache *ecache_net;
+ unsigned long last_id = ctx->last_id;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
- int res;
- int cpu;
- struct hlist_nulls_head *list;
- struct net *net = sock_net(skb->sk);
+#endif
- if (cb->args[2])
+ if (ctx->done)
return 0;
- last = (struct nf_conn *)cb->args[1];
+ ctx->last_id = 0;
- for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
- struct ct_pcpu *pcpu;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ ecache_net = nf_conn_pernet_ecache(net);
+ spin_lock_bh(&ecache_net->dying_lock);
- if (!cpu_possible(cpu))
- continue;
+ hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
+ struct nf_conn *ct;
+ int res;
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- spin_lock_bh(&pcpu->lock);
- list = dying ? &pcpu->dying : &pcpu->unconfirmed;
-restart:
- hlist_nulls_for_each_entry(h, n, list, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (l3proto && nf_ct_l3num(ct) != l3proto)
- continue;
- if (cb->args[1]) {
- if (ct != last)
- continue;
- cb->args[1] = 0;
- }
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (last_id && last_id != ctnetlink_get_id(ct))
+ continue;
- /* We can't dump extension info for the unconfirmed
- * list because unconfirmed conntracks can have
- * ct->ext reallocated (and thus freed).
- *
- * In the dying list case ct->ext can't be free'd
- * until after we drop pcpu->lock.
- */
- res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct, dying ? true : false, 0);
- if (res < 0) {
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
- cb->args[0] = cpu;
- cb->args[1] = (unsigned long)ct;
- spin_unlock_bh(&pcpu->lock);
- goto out;
- }
- }
- if (cb->args[1]) {
- cb->args[1] = 0;
- goto restart;
+ res = ctnetlink_dump_one_entry(skb, cb, ct, true);
+ if (res < 0) {
+ spin_unlock_bh(&ecache_net->dying_lock);
+ return skb->len;
}
- spin_unlock_bh(&pcpu->lock);
+
+ last_id = 0;
}
- cb->args[2] = 1;
-out:
- if (last)
- nf_ct_put(last);
- return skb->len;
-}
+ spin_unlock_bh(&ecache_net->dying_lock);
+#endif
+ ctx->done = true;
-static int
-ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, true);
+ return skb->len;
}
-static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_dying(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
- .done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
}
-static int
-ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, false);
-}
-
-static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
- .done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
@@ -1791,7 +1858,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
const struct nlattr *attr)
__must_hold(RCU)
{
- struct nf_nat_hook *nat_hook;
+ const struct nf_nat_hook *nat_hook;
int err;
nat_hook = rcu_dereference(nf_nat_hook);
@@ -1833,45 +1900,10 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
- unsigned long off)
-{
- unsigned int bit;
-
- /* Ignore these unchangable bits */
- on &= ~IPS_UNCHANGEABLE_MASK;
- off &= ~IPS_UNCHANGEABLE_MASK;
-
- for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
- if (on & (1 << bit))
- set_bit(bit, &ct->status);
- else if (off & (1 << bit))
- clear_bit(bit, &ct->status);
- }
-}
-
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
- unsigned long d;
- unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EBUSY;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EBUSY;
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EBUSY;
-
- __ctnetlink_change_status(ct, status, 0);
- return 0;
+ return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}
static int
@@ -1947,7 +1979,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
}
if (help) {
- if (help->helper == helper) {
+ if (rcu_access_pointer(help->helper) == helper) {
/* update private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
@@ -1966,16 +1998,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
static int ctnetlink_change_timeout(struct nf_conn *ct,
const struct nlattr * const cda[])
{
- u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = nfct_time_stamp + (u32)timeout;
-
- if (test_bit(IPS_DYING_BIT, &ct->status))
- return -ETIME;
-
- return 0;
+ return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
@@ -1988,15 +2011,14 @@ static void ctnetlink_change_mark(struct nf_conn *ct,
mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
mark = ntohl(nla_get_be32(cda[CTA_MARK]));
- newmark = (ct->mark & mask) ^ mark;
- if (newmark != ct->mark)
- ct->mark = newmark;
+ newmark = (READ_ONCE(ct->mark) & mask) ^ mark;
+ if (newmark != READ_ONCE(ct->mark))
+ WRITE_ONCE(ct->mark, newmark);
}
#endif
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
- [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
@@ -2234,11 +2256,6 @@ ctnetlink_create_conntrack(struct net *net,
if (!cda[CTA_TIMEOUT])
goto err1;
- timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = (u32)timeout + nfct_time_stamp;
-
rcu_read_lock();
if (cda[CTA_HELP]) {
char *helpname = NULL;
@@ -2282,14 +2299,10 @@ ctnetlink_create_conntrack(struct net *net,
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
- /* not in hash table yet so not strictly necessary */
+ /* disable helper auto-assignment for this entry */
+ ct->status |= IPS_HELPER;
RCU_INIT_POINTER(help->helper, helper);
}
- } else {
- /* try an implicit helper assignation */
- err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- if (err < 0)
- goto err2;
}
err = ctnetlink_setup_nat(ct, cda);
@@ -2306,6 +2319,9 @@ ctnetlink_create_conntrack(struct net *net,
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
+ timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+ __nf_ct_set_timeout(ct, timeout);
+
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
@@ -2362,12 +2378,15 @@ ctnetlink_create_conntrack(struct net *net,
err = nf_conntrack_hash_check_insert(ct);
if (err < 0)
- goto err2;
+ goto err3;
rcu_read_unlock();
return ct;
+err3:
+ if (ct->master)
+ nf_ct_put(ct->master);
err2:
rcu_read_unlock();
err1:
@@ -2375,18 +2394,15 @@ err1:
return ERR_PTR(err);
}
-static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- struct nf_conn *ct;
- u_int8_t u3 = nfmsg->nfgen_family;
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -2408,13 +2424,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
}
if (cda[CTA_TUPLE_ORIG])
- h = nf_conntrack_find_get(net, &zone, &otuple);
+ h = nf_conntrack_find_get(info->net, &zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = nf_conntrack_find_get(net, &zone, &rtuple);
+ h = nf_conntrack_find_get(info->net, &zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
enum ip_conntrack_events events;
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
@@ -2422,8 +2438,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
if (otuple.dst.protonum != rtuple.dst.protonum)
return -EINVAL;
- ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
- &rtuple, u3);
+ ct = ctnetlink_create_conntrack(info->net, &zone, cda,
+ &otuple, &rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
@@ -2446,7 +2462,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_SYNPROXY) |
events,
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_put(ct);
}
@@ -2456,7 +2472,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
err = -EEXIST;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
err = ctnetlink_change_conntrack(ct, cda);
if (err == 0) {
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
@@ -2468,7 +2484,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_MARK) |
(1 << IPCT_SYNPROXY),
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
}
@@ -2481,23 +2497,17 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
__u16 cpu, const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_CT_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
- nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
@@ -2505,7 +2515,11 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
- htonl(st->search_restart)))
+ htonl(st->search_restart)) ||
+ nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
+ htonl(st->clash_resolve)) ||
+ nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
+ htonl(st->chaintoolong)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -2544,17 +2558,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_ct_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -2564,21 +2576,17 @@ static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct net *net)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ unsigned int nr_conntracks;
+ struct nlmsghdr *nlh;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
+ nr_conntracks = nf_conntrack_count(net);
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
@@ -2594,10 +2602,8 @@ nlmsg_failure:
return -1;
}
-static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct sk_buff *skb2;
int err;
@@ -2607,23 +2613,15 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
return -ENOMEM;
err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
sock_net(skb->sk));
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
@@ -2662,6 +2660,8 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
@@ -2676,12 +2676,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
;
}
-static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo)
-{
- return nf_ct_get(skb, ctinfo);
-}
-
static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
const struct nf_conntrack_zone *zone;
@@ -2719,10 +2713,14 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_helpinfo(skb, ct) < 0)
@@ -2743,7 +2741,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, true) < 0)
goto nla_put_failure;
#endif
if (ctnetlink_dump_labels(skb, ct) < 0)
@@ -2799,7 +2797,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
* unchangeable bits but do not error out. Also user programs
* are allowed to clear the bits that they are allowed to change.
*/
- __ctnetlink_change_status(ct, status, ~status);
+ __nf_ct_change_status(ct, status, ~status);
return 0;
}
@@ -2914,8 +2912,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}
-static struct nfnl_ct_hook ctnetlink_glue_hook = {
- .get_ct = ctnetlink_glue_get_ct,
+static const struct nfnl_ct_hook ctnetlink_glue_hook = {
.build_size = ctnetlink_glue_build_size,
.build = ctnetlink_glue_build,
.parse = ctnetlink_glue_parse,
@@ -2959,6 +2956,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
memset(&m, 0xFF, sizeof(m));
memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
m.src.u.all = mask->src.u.all;
+ m.src.l3num = tuple->src.l3num;
m.dst.protonum = tuple->dst.protonum;
nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
@@ -2984,11 +2982,13 @@ nla_put_failure:
return -1;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const union nf_inet_addr any_addr;
+#endif
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
- static __read_mostly siphash_key_t exp_id_seed;
+ static siphash_aligned_key_t exp_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
@@ -3081,19 +3081,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -3108,12 +3103,11 @@ nla_put_failure:
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
-ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
{
struct nf_conntrack_expect *exp = item->exp;
struct net *net = nf_ct_exp_net(exp);
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *skb;
unsigned int type, group;
int flags = 0;
@@ -3136,15 +3130,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -3161,23 +3151,27 @@ errout:
return 0;
}
#endif
-static int ctnetlink_exp_done(struct netlink_callback *cb)
+
+static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
{
- if (cb->args[1])
- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
- return 0;
+ unsigned long id = (unsigned long)exp;
+
+ id += nf_ct_get_id(exp->master);
+ id += exp->class;
+
+ return id ? id : 1;
}
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
@@ -3189,7 +3183,7 @@ restart:
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3198,9 +3192,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3211,32 +3203,30 @@ restart:
}
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
struct nf_conn *ct = cb->data;
struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
if (cb->args[0])
return 0;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
+
restart:
hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -3244,9 +3234,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -3257,9 +3245,6 @@ restart:
cb->args[0] = 1;
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
@@ -3278,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
- .done = ctnetlink_exp_done,
};
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
@@ -3309,29 +3293,27 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
return err;
}
-static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct sk_buff *skb2;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
if (cda[CTA_EXPECT_MASTER])
- return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda,
- extack);
+ return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
+ info->nlh, cda,
+ info->extack);
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
@@ -3351,7 +3333,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
@@ -3364,42 +3346,39 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
}
}
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_expect_put(exp);
- goto out;
+ return -ENOMEM;
}
rcu_read_lock();
err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+ info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+ exp);
rcu_read_unlock();
nf_ct_expect_put(exp);
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
+ struct nf_conntrack_helper *helper;
const struct nf_conn_help *m_help;
const char *name = data;
m_help = nfct_help(exp->master);
- return strcmp(m_help->helper->name, name) == 0;
+ helper = rcu_dereference(m_help->helper);
+ if (!helper)
+ return false;
+
+ return strcmp(helper->name, name) == 0;
}
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
@@ -3407,15 +3386,13 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
return true;
}
-static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3431,13 +3408,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
return err;
/* bump usage count to 2 */
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
- if (ntohl(id) != (u32)(unsigned long)exp) {
+
+ if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp);
return -ENOENT;
}
@@ -3445,9 +3423,9 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
/* after list removal, usage count == 1 */
spin_lock_bh(&nf_conntrack_expect_lock);
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_expect_put(exp);
}
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3457,14 +3435,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
} else if (cda[CTA_EXPECT_HELP_NAME]) {
char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
- nf_ct_expect_iterate_net(net, expect_iter_name, name,
+ nf_ct_expect_iterate_net(info->net, expect_iter_name, name,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
} else {
/* This basically means we have to flush everything*/
- nf_ct_expect_iterate_net(net, expect_iter_all, NULL,
+ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return 0;
@@ -3474,7 +3452,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
if (cda[CTA_EXPECT_TIMEOUT]) {
- if (!del_timer(&x->timeout))
+ if (!timer_delete(&x->timeout))
return -ETIME;
x->timeout.expires = jiffies +
@@ -3484,10 +3462,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
return 0;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
[CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
[CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
};
+#endif
static int
ctnetlink_parse_expect_nat(const struct nlattr *attr,
@@ -3660,15 +3640,13 @@ err_ct:
return err;
}
-static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3687,20 +3665,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
return err;
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = __nf_ct_expect_find(net, &zone, &tuple);
+ exp = __nf_ct_expect_find(info->net, &zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
- err = ctnetlink_create_expect(net, &zone, cda, u3,
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
+ err = ctnetlink_create_expect(info->net, &zone, cda, u3,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return err;
}
err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3712,20 +3690,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_EXP_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
@@ -3766,17 +3739,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -3784,44 +3755,77 @@ static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static struct nf_ct_event_notifier ctnl_notifier = {
- .fcn = ctnetlink_conntrack_event,
-};
-
-static struct nf_exp_event_notifier ctnl_notifier_exp = {
- .fcn = ctnetlink_expect_event,
+ .ct_event = ctnetlink_conntrack_event,
+ .exp_event = ctnetlink_expect_event,
};
#endif
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
- [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
- [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
- [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
- [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
+ [IPCTNL_MSG_CT_NEW] = {
+ .call = ctnetlink_new_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_DELETE] = {
+ .call = ctnetlink_del_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_ct_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_STATS] = {
+ .call = ctnetlink_stat_ct,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_DYING] = {
+ .call = ctnetlink_get_ct_dying,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = {
+ .call = ctnetlink_get_ct_unconfirmed,
+ .type = NFNL_CB_MUTEX,
+ },
};
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
- [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
+ [IPCTNL_MSG_EXP_GET] = {
+ .call = ctnetlink_get_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_NEW] = {
+ .call = ctnetlink_new_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_DELETE] = {
+ .call = ctnetlink_del_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_exp_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
};
static const struct nfnetlink_subsystem ctnl_subsys = {
@@ -3845,58 +3849,29 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- int ret;
-
- ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot register notifier.\n");
- goto err_out;
- }
-
- ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
+ nf_conntrack_register_notifier(net, &ctnl_notifier);
#endif
return 0;
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
-err_out:
- return ret;
-#endif
}
-static void ctnetlink_net_exit(struct net *net)
+static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+ nf_conntrack_unregister_notifier(net);
#endif
}
-static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
-{
- struct net *net;
-
- list_for_each_entry(net, net_exit_list, exit_list)
- ctnetlink_net_exit(net);
-
- /* wait for other cpus until they are done with ctnl_notifiers */
- synchronize_rcu();
-}
-
static struct pernet_operations ctnetlink_net_ops = {
.init = ctnetlink_net_init,
- .exit_batch = ctnetlink_net_exit_batch,
+ .pre_exit = ctnetlink_net_pre_exit,
};
static int __init ctnetlink_init(void)
{
int ret;
+ NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx);
+
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c
new file mode 100644
index 000000000000..068e9489e1c2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ovs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support ct functions for openvswitch and used by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
+#include <net/ip.h>
+#include <linux/netfilter_ipv6.h>
+
+/* 'skb' should already be pulled to nh_ofs. */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol;
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT;
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper);
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper);
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = skb_ip_totlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ len = ntohs(ipv6_hdr(skb)->payload_len);
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) {
+ int err = nf_ip6_check_hbh_len(skb, &len);
+
+ if (err)
+ return err;
+ }
+ len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ len = skb->len;
+ }
+
+ return pskb_trim_rcsum(skb, len);
+}
+EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
+
+/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ */
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u16 zone, u8 family, u8 *proto, u16 *mru)
+{
+ int err;
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err)
+ return err;
+
+ *mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ } else if (family == NFPROTO_IPV6) {
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err) {
+ if (err != -EINPROGRESS)
+ kfree_skb(skb);
+ return err;
+ }
+
+ *proto = ipv6_hdr(skb)->nexthdr;
+ *mru = IP6CB(skb)->frag_max_size;
+#endif
+ } else {
+ kfree_skb(skb);
+ return -EPFNOSUPPORT;
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 1f44d523b512..4c679638df06 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
* It is a specification defined by Microsoft and some vendors
* working with Microsoft. PPTP is built on top of a modified
* version of the Internet Generic Routing Encapsulation Protocol.
@@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
static DEFINE_SPINLOCK(nf_pptp_lock);
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
- struct nf_conntrack_expect *expect_reply)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
- struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* PptpControlMessageType names */
@@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
static void pptp_expectfn(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ const struct nf_nat_pptp_hook *hook;
struct net *net = nf_ct_net(ct);
- typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
pr_debug("increasing timeouts\n");
/* increase timeout of GRE data channel conntrack entry */
@@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct,
/* Can you see how rusty this code is, compared with the pre-2.6.11
* one? That's what happened to my shiny newnat of 2002 ;( -HW */
- nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
- if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
- nf_nat_pptp_expectfn(ct, exp);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->master->status & IPS_NAT_MASK)
+ hook->expectfn(ct, exp);
else {
struct nf_conntrack_tuple inv_t;
struct nf_conntrack_expect *exp_other;
@@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
{
struct nf_conntrack_expect *exp_orig, *exp_reply;
+ const struct nf_nat_pptp_hook *hook;
enum ip_conntrack_dir dir;
int ret = 1;
- typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
exp_orig = nf_ct_expect_alloc(ct);
if (exp_orig == NULL)
@@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
IPPROTO_GRE, &callid, &peer_callid);
exp_reply->expectfn = pptp_expectfn;
- nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
- if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
- nf_nat_pptp_exp_gre(exp_orig, exp_reply);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ hook->exp_gre(exp_orig, exp_reply);
if (nf_ct_expect_related(exp_orig, 0) != 0)
goto out_put_both;
if (nf_ct_expect_related(exp_reply, 0) != 0)
@@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
msg = ntohs(ctlh->messageType);
pr_debug("inbound control message %s\n", pptp_msg_name(msg));
@@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
- if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
msg = ntohs(ctlh->messageType);
pr_debug("outbound control message %s\n", pptp_msg_name(msg));
@@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
- if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
@@ -544,7 +520,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
nexthdr_off = protoff;
tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
- BUG_ON(!tcph);
+ if (!tcph)
+ return NF_ACCEPT;
+
nexthdr_off += tcph->doff * 4;
datalen = tcplen - tcph->doff * 4;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index a0560d175a7f..bc1d96686b9c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -42,17 +42,16 @@
#include <net/ipv6.h>
#include <net/inet_frag.h>
-extern unsigned int nf_conntrack_net_id;
-
static DEFINE_MUTEX(nf_ct_proto_mutex);
#ifdef CONFIG_SYSCTL
-__printf(5, 6)
+__printf(4, 5)
void nf_l4proto_log_invalid(const struct sk_buff *skb,
- struct net *net,
- u16 pf, u8 protonum,
+ const struct nf_hook_state *state,
+ u8 protonum,
const char *fmt, ...)
{
+ struct net *net = state->net;
struct va_format vaf;
va_list args;
@@ -64,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_proto_%d: %pV ", protonum, &vaf);
+ nf_log_packet(net, state->pf, 0, skb, state->in, state->out,
+ NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf);
va_end(args);
}
EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid);
-__printf(3, 4)
+__printf(4, 5)
void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
const struct nf_conn *ct,
+ const struct nf_hook_state *state,
const char *fmt, ...)
{
struct va_format vaf;
@@ -87,7 +87,7 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct),
+ nf_l4proto_log_invalid(skb, state,
nf_ct_protonum(ct), "%pV", &vaf);
va_end(args);
}
@@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
case IPPROTO_UDP: return &nf_conntrack_l4proto_udp;
case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp;
case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp;
#endif
@@ -121,17 +118,64 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
};
EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
-unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
+}
+
+unsigned int nf_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
const struct nf_conn_help *help;
+ enum ip_conntrack_info ctinfo;
+ unsigned int protoff;
+ struct nf_conn *ct;
+ bool seqadj_needed;
+ __be16 frag_off;
+ int start;
+ u8 pnum;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || in_vrf_postrouting(state))
+ return NF_ACCEPT;
help = nfct_help(ct);
+
+ seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb);
+ if (!help && !seqadj_needed)
+ return nf_conntrack_confirm(skb);
+
+ /* helper->help() do not expect ICMP packets */
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return nf_conntrack_confirm(skb);
+
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ pnum = ipv6_hdr(skb)->nexthdr;
+ start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off);
+ if (start < 0 || (frag_off & htons(~0x7)) != 0)
+ return nf_conntrack_confirm(skb);
+
+ protoff = start;
+ break;
+ default:
+ return nf_conntrack_confirm(skb);
+ }
+
if (help) {
const struct nf_conntrack_helper *helper;
int ret;
- /* rcu_read_lock()ed by nf_hook_thresh */
+ /* rcu_read_lock()ed by nf_hook */
helper = rcu_dereference(help->helper);
if (helper) {
ret = helper->help(skb,
@@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
}
}
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
+ if (seqadj_needed &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
}
/* We've seen it coming out the other side: confirm it */
@@ -155,22 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff,
}
EXPORT_SYMBOL_GPL(nf_confirm);
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return nf_conntrack_confirm(skb);
-
- return nf_confirm(skb,
- skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
-
static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -217,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -255,16 +281,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
/* We only do TCP and SCTP at the moment: is there a better way? */
if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ tuple.dst.protonum != IPPROTO_SCTP)
return -ENOPROTOOPT;
- }
- if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
+ if ((unsigned int)*len < sizeof(struct sockaddr_in))
return -EINVAL;
- }
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (h) {
@@ -278,17 +299,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
.tuple.dst.u3.ip;
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
nf_ct_put(ct);
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
return -EFAULT;
else
return 0;
}
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
return -ENOENT;
}
@@ -331,12 +347,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (!h) {
- pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
- &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ if (!h)
return -ENOENT;
- }
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -360,30 +372,6 @@ static struct nf_sockopt_ops so_getorigdst6 = {
.owner = THIS_MODULE,
};
-static unsigned int ipv6_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned char pnum = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int protoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return nf_conntrack_confirm(skb);
-
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return nf_conntrack_confirm(skb);
- }
-
- return nf_confirm(skb, protoff, ct, ctinfo);
-}
-
static unsigned int ipv6_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -412,13 +400,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = {
.priority = NF_IP6_PRI_CONNTRACK,
},
{
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_LAST,
},
{
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_LAST - 1,
@@ -446,7 +434,7 @@ static struct nf_ct_bridge_info *nf_ct_bridge_info;
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
bool fixup_needed = false, retry = true;
int err = 0;
retry:
@@ -522,29 +510,37 @@ retry:
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
- if (fixup_needed)
- nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
- (void *)(unsigned long)nfproto, 0, 0);
+ if (fixup_needed) {
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = (void *)(unsigned long)nfproto,
+ };
+ nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
+ }
return err;
}
static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
case NFPROTO_IPV4:
- if (cnet->users4 && (--cnet->users4 == 0))
+ if (cnet->users4 && (--cnet->users4 == 0)) {
nf_unregister_net_hooks(net, ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_defrag_ipv4_disable(net);
+ }
break;
#if IS_ENABLED(CONFIG_IPV6)
case NFPROTO_IPV6:
- if (cnet->users6 && (--cnet->users6 == 0))
+ if (cnet->users6 && (--cnet->users6 == 0)) {
nf_unregister_net_hooks(net, ipv6_conntrack_ops,
ARRAY_SIZE(ipv6_conntrack_ops));
+ nf_defrag_ipv6_disable(net);
+ }
break;
#endif
case NFPROTO_BRIDGE:
@@ -565,6 +561,7 @@ static int nf_ct_netns_inet_get(struct net *net)
int err;
err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+#if IS_ENABLED(CONFIG_IPV6)
if (err < 0)
goto err1;
err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
@@ -575,6 +572,7 @@ static int nf_ct_netns_inet_get(struct net *net)
err2:
nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
+#endif
return err;
}
@@ -610,7 +608,7 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto)
switch (nfproto) {
case NFPROTO_BRIDGE:
nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
- /* fall through */
+ fallthrough;
case NFPROTO_INET:
nf_ct_netns_do_put(net, NFPROTO_IPV4);
nf_ct_netns_do_put(net, NFPROTO_IPV6);
@@ -658,7 +656,7 @@ int nf_conntrack_proto_init(void)
#if IS_ENABLED(CONFIG_IPV6)
cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst6);
+ nf_unregister_sockopt(&so_getorigdst);
#endif
return ret;
}
@@ -680,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#if IS_ENABLED(CONFIG_IPV6)
nf_conntrack_icmpv6_init_net(net);
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- nf_conntrack_dccp_init_net(net);
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
nf_conntrack_sctp_init_net(net);
#endif
@@ -691,13 +686,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
#endif
}
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
- nf_ct_gre_keymap_flush(net);
-#endif
-}
-
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
@@ -705,3 +693,4 @@ MODULE_ALIAS("ip_conntrack");
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
deleted file mode 100644
index b3f4a334f9d7..000000000000
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ /dev/null
@@ -1,770 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * DCCP connection tracking protocol helper
- *
- * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sysctl.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/dccp.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_timeout.h>
-#include <net/netfilter/nf_log.h>
-
-/* Timeouts are based on values from RFC4340:
- *
- * - REQUEST:
- *
- * 8.1.2. Client Request
- *
- * A client MAY give up on its DCCP-Requests after some time
- * (3 minutes, for example).
- *
- * - RESPOND:
- *
- * 8.1.3. Server Response
- *
- * It MAY also leave the RESPOND state for CLOSED after a timeout of
- * not less than 4MSL (8 minutes);
- *
- * - PARTOPEN:
- *
- * 8.1.5. Handshake Completion
- *
- * If the client remains in PARTOPEN for more than 4MSL (8 minutes),
- * it SHOULD reset the connection with Reset Code 2, "Aborted".
- *
- * - OPEN:
- *
- * The DCCP timestamp overflows after 11.9 hours. If the connection
- * stays idle this long the sequence number won't be recognized
- * as valid anymore.
- *
- * - CLOSEREQ/CLOSING:
- *
- * 8.3. Termination
- *
- * The retransmission timer should initially be set to go off in two
- * round-trip times and should back off to not less than once every
- * 64 seconds ...
- *
- * - TIMEWAIT:
- *
- * 4.3. States
- *
- * A server or client socket remains in this state for 2MSL (4 minutes)
- * after the connection has been town down, ...
- */
-
-#define DCCP_MSL (2 * 60 * HZ)
-
-static const char * const dccp_state_names[] = {
- [CT_DCCP_NONE] = "NONE",
- [CT_DCCP_REQUEST] = "REQUEST",
- [CT_DCCP_RESPOND] = "RESPOND",
- [CT_DCCP_PARTOPEN] = "PARTOPEN",
- [CT_DCCP_OPEN] = "OPEN",
- [CT_DCCP_CLOSEREQ] = "CLOSEREQ",
- [CT_DCCP_CLOSING] = "CLOSING",
- [CT_DCCP_TIMEWAIT] = "TIMEWAIT",
- [CT_DCCP_IGNORE] = "IGNORE",
- [CT_DCCP_INVALID] = "INVALID",
-};
-
-#define sNO CT_DCCP_NONE
-#define sRQ CT_DCCP_REQUEST
-#define sRS CT_DCCP_RESPOND
-#define sPO CT_DCCP_PARTOPEN
-#define sOP CT_DCCP_OPEN
-#define sCR CT_DCCP_CLOSEREQ
-#define sCG CT_DCCP_CLOSING
-#define sTW CT_DCCP_TIMEWAIT
-#define sIG CT_DCCP_IGNORE
-#define sIV CT_DCCP_INVALID
-
-/*
- * DCCP state transition table
- *
- * The assumption is the same as for TCP tracking:
- *
- * We are the man in the middle. All the packets go through us but might
- * get lost in transit to the destination. It is assumed that the destination
- * can't receive segments we haven't seen.
- *
- * The following states exist:
- *
- * NONE: Initial state, expecting Request
- * REQUEST: Request seen, waiting for Response from server
- * RESPOND: Response from server seen, waiting for Ack from client
- * PARTOPEN: Ack after Response seen, waiting for packet other than Response,
- * Reset or Sync from server
- * OPEN: Packet other than Response, Reset or Sync seen
- * CLOSEREQ: CloseReq from server seen, expecting Close from client
- * CLOSING: Close seen, expecting Reset
- * TIMEWAIT: Reset seen
- * IGNORE: Not determinable whether packet is valid
- *
- * Some states exist only on one side of the connection: REQUEST, RESPOND,
- * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
- * the one it was in before.
- *
- * Packets are marked as ignored (sIG) if we don't know if they're valid
- * (for example a reincarnation of a connection we didn't notice is dead
- * already) and the server may send back a connection closing Reset or a
- * Response. They're also used for Sync/SyncAck packets, which we don't
- * care about.
- */
-static const u_int8_t
-dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
- [CT_DCCP_ROLE_CLIENT] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sRQ Regular Request
- * sRQ -> sRQ Retransmitted Request or reincarnation
- * sRS -> sRS Retransmitted Request (apparently Response
- * got lost after we saw it) or reincarnation
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation
- *
- * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */
- sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, might be response to ignored Request
- * sRS -> sIG Ignore, might be response to ignored Request
- * sPO -> sIG Ignore, might be response to ignored Request
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, reincarnation in reverse direction
- * goes through sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN
- * sOP -> sOP Regular ACK, remain in OPEN
- * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet
- * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Remain in PARTOPEN state
- * sOP -> sOP Regular DataAck packet in OPEN state
- * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * CLOSEREQ may only be sent by the server.
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sCG Client-initiated close
- * sOP -> sCG Client-initiated close
- * sCR -> sCG Close in response to CloseReq (8.3.)
- * sCG -> sCG Retransmit
- * sTW -> sIV Late retransmit, already in TIME_WAIT
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.)
- * sRS -> sTW Response received without Request
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.)
- * sOP -> sTW Connection reset
- * sCR -> sTW Connection reset
- * sCG -> sTW Connection reset
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
- [CT_DCCP_ROLE_SERVER] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, conntrack might be out of sync
- * sRS -> sIG Ignore, conntrack might be out of sync
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation, must reverse roles
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Response without Request
- * sRQ -> sRS Response to clients Request
- * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT)
- * sPO -> sIG Response to an ignored Request or late retransmit
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, Request from client in sTW moves to sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Ack in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular DataAck in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.)
- * sOP -> sCR CloseReq in OPEN state
- * sCR -> sCR Retransmit
- * sCG -> sCR Simultaneous close, client sends another Close
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCG Move direcly to CLOSING
- * sOP -> sCG Move to CLOSING
- * sCR -> sIV Close after CloseReq is invalid
- * sCG -> sCG Retransmit
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Reset in response to Request
- * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sOP -> sTW
- * sCR -> sTW
- * sCG -> sTW
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
-};
-
-static noinline bool
-dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- const struct dccp_hdr *dh)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_dccp_net *dn;
- const char *msg;
- u_int8_t state;
-
- state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
- switch (state) {
- default:
- dn = nf_dccp_pernet(net);
- if (dn->dccp_loose == 0) {
- msg = "not picking up existing connection ";
- goto out_invalid;
- }
- case CT_DCCP_REQUEST:
- break;
- case CT_DCCP_INVALID:
- msg = "invalid state transition ";
- goto out_invalid;
- }
-
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.state = CT_DCCP_NONE;
- ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
- ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
- ct->proto.dccp.handshake_seq = 0;
- return true;
-
-out_invalid:
- nf_ct_l4proto_log_invalid(skb, ct, "%s", msg);
- return false;
-}
-
-static u64 dccp_ack_seq(const struct dccp_hdr *dh)
-{
- const struct dccp_hdr_ack_bits *dhack;
-
- dhack = (void *)dh + __dccp_basic_hdr_len(dh);
- return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
- ntohl(dhack->dccph_ack_nr_low);
-}
-
-static bool dccp_error(const struct dccp_hdr *dh,
- struct sk_buff *skb, unsigned int dataoff,
- const struct nf_hook_state *state)
-{
- unsigned int dccp_len = skb->len - dataoff;
- unsigned int cscov;
- const char *msg;
-
- if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
- dh->dccph_doff * 4 > dccp_len) {
- msg = "nf_ct_dccp: truncated/malformed packet ";
- goto out_invalid;
- }
-
- cscov = dccp_len;
- if (dh->dccph_cscov) {
- cscov = (dh->dccph_cscov - 1) * 4;
- if (cscov > dccp_len) {
- msg = "nf_ct_dccp: bad checksum coverage ";
- goto out_invalid;
- }
- }
-
- if (state->hook == NF_INET_PRE_ROUTING &&
- state->net->ct.sysctl_checksum &&
- nf_checksum_partial(skb, state->hook, dataoff, cscov,
- IPPROTO_DCCP, state->pf)) {
- msg = "nf_ct_dccp: bad checksum ";
- goto out_invalid;
- }
-
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- msg = "nf_ct_dccp: reserved packet type ";
- goto out_invalid;
- }
- return false;
-out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_DCCP, "%s", msg);
- return true;
-}
-
-int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- struct dccp_hdr _dh, *dh;
- u_int8_t type, old_state, new_state;
- enum ct_dccp_roles role;
- unsigned int *timeouts;
-
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- if (!dh)
- return NF_DROP;
-
- if (dccp_error(dh, skb, dataoff, state))
- return -NF_ACCEPT;
-
- type = dh->dccph_type;
- if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
- return -NF_ACCEPT;
-
- if (type == DCCP_PKT_RESET &&
- !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- /* Tear down connection immediately if only reply is a RESET */
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_ACCEPT;
- }
-
- spin_lock_bh(&ct->lock);
-
- role = ct->proto.dccp.role[dir];
- old_state = ct->proto.dccp.state;
- new_state = dccp_state_table[role][type][old_state];
-
- switch (new_state) {
- case CT_DCCP_REQUEST:
- if (old_state == CT_DCCP_TIMEWAIT &&
- role == CT_DCCP_ROLE_SERVER) {
- /* Reincarnation in the reverse direction: reopen and
- * reverse client/server roles. */
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
- }
- break;
- case CT_DCCP_RESPOND:
- if (old_state == CT_DCCP_REQUEST)
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- break;
- case CT_DCCP_PARTOPEN:
- if (old_state == CT_DCCP_RESPOND &&
- type == DCCP_PKT_ACK &&
- dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
- set_bit(IPS_ASSURED_BIT, &ct->status);
- break;
- case CT_DCCP_IGNORE:
- /*
- * Connection tracking might be out of sync, so we ignore
- * packets that might establish a new connection and resync
- * if the server responds with a valid Response.
- */
- if (ct->proto.dccp.last_dir == !dir &&
- ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
- type == DCCP_PKT_RESPONSE) {
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- new_state = CT_DCCP_RESPOND;
- break;
- }
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
-
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet");
- return NF_ACCEPT;
- case CT_DCCP_INVALID:
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition");
- return -NF_ACCEPT;
- }
-
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
- ct->proto.dccp.state = new_state;
- spin_unlock_bh(&ct->lock);
-
- if (new_state != old_state)
- nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
-
- timeouts = nf_ct_timeout_lookup(ct);
- if (!timeouts)
- timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout;
- nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
-
- return NF_ACCEPT;
-}
-
-static bool dccp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.dccp.state) {
- case CT_DCCP_CLOSEREQ:
- case CT_DCCP_CLOSING:
- case CT_DCCP_TIMEWAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
-{
- seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
-{
- struct nlattr *nest_parms;
-
- spin_lock_bh(&ct->lock);
- nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP);
- if (!nest_parms)
- goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
- nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq),
- CTA_PROTOINFO_DCCP_PAD))
- goto nla_put_failure;
- nla_nest_end(skb, nest_parms);
- spin_unlock_bh(&ct->lock);
- return 0;
-
-nla_put_failure:
- spin_unlock_bh(&ct->lock);
- return -1;
-}
-
-static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
- [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
- [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC },
-};
-
-#define DCCP_NLATTR_SIZE ( \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \
- NLA_ALIGN(NLA_HDRLEN + 0))
-
-static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
-{
- struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
- struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
- int err;
-
- if (!attr)
- return 0;
-
- err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr,
- dccp_nla_policy, NULL);
- if (err < 0)
- return err;
-
- if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
- !tb[CTA_PROTOINFO_DCCP_ROLE] ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
- return -EINVAL;
- }
-
- spin_lock_bh(&ct->lock);
- ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
- if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- } else {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
- }
- if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
- ct->proto.dccp.handshake_seq =
- be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
- }
- spin_unlock_bh(&ct->lock);
- return 0;
-}
-#endif
-
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
- struct net *net, void *data)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
- unsigned int *timeouts = data;
- int i;
-
- if (!timeouts)
- timeouts = dn->dccp_timeout;
-
- /* set default DCCP timeouts. */
- for (i=0; i<CT_DCCP_MAX; i++)
- timeouts[i] = dn->dccp_timeout[i];
-
- /* there's a 1:1 mapping between attributes and protocol states. */
- for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
- if (tb[i]) {
- timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
- }
- }
-
- timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST];
- return 0;
-}
-
-static int
-dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
- const unsigned int *timeouts = data;
- int i;
-
- for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
- if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
- goto nla_put_failure;
- }
- return 0;
-
-nla_put_failure:
- return -ENOSPC;
-}
-
-static const struct nla_policy
-dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
- [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-
-void nf_conntrack_dccp_init_net(struct net *net)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
- /* default values */
- dn->dccp_loose = 1;
- dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
-
- /* timeouts[0] is unused, make it same as SYN_SENT so
- * ->timeouts[0] contains 'new' timeout, like udp or icmp.
- */
- dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST];
-}
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
- .l4proto = IPPROTO_DCCP,
- .can_early_drop = dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = DCCP_NLATTR_SIZE,
- .to_nlattr = dccp_to_nlattr,
- .from_nlattr = nlattr_to_dccp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- .ctnl_timeout = {
- .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
- .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
- .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
- .nla_policy = dccp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 5b05487a60d2..af369e686fc5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
return &net->ct.nf_ct_proto.gre;
}
-void nf_ct_gre_keymap_flush(struct net *net)
-{
- struct nf_gre_net *net_gre = gre_pernet(net);
- struct nf_ct_gre_keymap *km, *tmp;
-
- spin_lock_bh(&keymap_lock);
- list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
- list_del_rcu(&km->list);
- kfree_rcu(km, rcu);
- }
- spin_unlock_bh(&keymap_lock);
-}
-
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
const struct nf_conntrack_tuple *t)
{
@@ -218,8 +205,7 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state)
{
- if (state->pf != NFPROTO_IPV4)
- return -NF_ACCEPT;
+ unsigned long status;
if (!nf_ct_is_confirmed(ct)) {
unsigned int *timeouts = nf_ct_timeout_lookup(ct);
@@ -233,11 +219,17 @@ int nf_conntrack_gre_packet(struct nf_conn *ct,
ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
}
+ status = READ_ONCE(ct->status);
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
- if (ct->status & IPS_SEEN_REPLY) {
+ if (status & IPS_SEEN_REPLY) {
nf_ct_refresh_acct(ct, ctinfo, skb,
ct->proto.gre.stream_timeout);
+
+ /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+ if (unlikely((status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe. */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
@@ -312,6 +304,7 @@ void nf_conntrack_gre_init_net(struct net *net)
/* protocol helper struct */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = {
.l4proto = IPPROTO_GRE,
+ .allow_clash = true,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = gre_print_conntrack,
#endif
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 4efd8741c105..b38b7164acd5 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -170,12 +170,12 @@ int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
ct_daddr = &ct->tuplehash[dir].tuple.dst.u3;
if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) {
if (state->pf == AF_INET) {
- nf_l4proto_log_invalid(skb, state->net, state->pf,
+ nf_l4proto_log_invalid(skb, state,
l4proto,
"outer daddr %pI4 != inner %pI4",
&outer_daddr->ip, &ct_daddr->ip);
} else if (state->pf == AF_INET6) {
- nf_l4proto_log_invalid(skb, state->net, state->pf,
+ nf_l4proto_log_invalid(skb, state,
l4proto,
"outer daddr %pI6 != inner %pI6",
&outer_daddr->ip6, &ct_daddr->ip6);
@@ -197,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg);
}
/* Small and modified version of icmp_rcv */
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index facd8c64ec4e..327b8059025d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = {
[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
- [ICMPV6_MLD2_REPORT - 130] = 1
+ [ICMPV6_MLD2_REPORT - 130] = 1,
+ [ICMPV6_MRDISC_ADV - 130] = 1,
+ [ICMPV6_MRDISC_SOL - 130] = 1
};
bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
@@ -126,8 +128,57 @@ static void icmpv6_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMPV6, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg);
+}
+
+static noinline_for_stack int
+nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
+{
+ u8 hl = ipv6_hdr(skb)->hop_limit;
+ union nf_inet_addr outer_daddr;
+ union {
+ struct nd_opt_hdr nd_opt;
+ struct rd_msg rd_msg;
+ } tmp;
+ const struct nd_opt_hdr *nd_opt;
+ const struct rd_msg *rd_msg;
+
+ rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg);
+ if (!rd_msg) {
+ icmpv6_error_log(skb, state, "short redirect");
+ return -NF_ACCEPT;
+ }
+
+ if (rd_msg->icmph.icmp6_code != 0)
+ return NF_ACCEPT;
+
+ if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect");
+ return -NF_ACCEPT;
+ }
+
+ dataoff += sizeof(*rd_msg);
+
+ /* warning: rd_msg no longer usable after this call */
+ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt);
+ if (!nd_opt || nd_opt->nd_opt_len == 0) {
+ icmpv6_error_log(skb, state, "redirect without options");
+ return -NF_ACCEPT;
+ }
+
+ /* We could call ndisc_parse_options(), but it would need
+ * skb_linearize() and a bit more work.
+ */
+ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR)
+ return NF_ACCEPT;
+
+ memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+ sizeof(outer_daddr.ip6));
+ dataoff += 8;
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMPV6, &outer_daddr);
}
int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
@@ -160,6 +211,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
return NF_ACCEPT;
}
+ if (icmp6h->icmp6_type == NDISC_REDIRECT)
+ return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state);
+
/* is not error message ? */
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 4f897b14b606..7c6f7c9f7332 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -27,41 +27,31 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR
-
- And so for me for SCTP :D -Kiran */
-
static const char *const sctp_conntrack_names[] = {
- "NONE",
- "CLOSED",
- "COOKIE_WAIT",
- "COOKIE_ECHOED",
- "ESTABLISHED",
- "SHUTDOWN_SENT",
- "SHUTDOWN_RECD",
- "SHUTDOWN_ACK_SENT",
- "HEARTBEAT_SENT",
- "HEARTBEAT_ACKED",
+ [SCTP_CONNTRACK_NONE] = "NONE",
+ [SCTP_CONNTRACK_CLOSED] = "CLOSED",
+ [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT",
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED",
+ [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED",
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT",
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD",
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT",
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT",
};
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
- [SCTP_CONNTRACK_CLOSED] = 10 SECS,
- [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
- [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
- [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS,
- [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
+ [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10),
+ [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210),
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30),
};
+#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
+
#define sNO SCTP_CONNTRACK_NONE
#define sCL SCTP_CONNTRACK_CLOSED
#define sCW SCTP_CONNTRACK_COOKIE_WAIT
@@ -71,7 +61,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
-#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -88,15 +77,12 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als
COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
- that of the HEARTBEAT chunk. Secondary connection is
- established.
*/
/* TODO
@@ -113,33 +99,33 @@ cookie echoed to closed.
static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
-/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
-/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
+/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES},
}
};
@@ -151,6 +137,7 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
}
#endif
+/* do_basic_checks ensures sch->length > 0, do not use before */
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
(offset) < (skb)->len && \
@@ -161,7 +148,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
static int do_basic_checks(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- unsigned long *map)
+ unsigned long *map,
+ const struct nf_hook_state *state)
{
u_int32_t offset, count;
struct sctp_chunkhdr _sch, *sch;
@@ -170,8 +158,6 @@ static int do_basic_checks(struct nf_conn *ct,
flag = 0;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- pr_debug("Chunk Num: %d Type: %d\n", count, sch->type);
-
if (sch->type == SCTP_CID_INIT ||
sch->type == SCTP_CID_INIT_ACK ||
sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
@@ -186,7 +172,9 @@ static int do_basic_checks(struct nf_conn *ct,
sch->type == SCTP_CID_COOKIE_ECHO ||
flag) &&
count != 0) || !sch->length) {
- pr_debug("Basic checks failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "%s failed. chunk num %d, type %d, len %d flag %d\n",
+ __func__, count, sch->type, sch->length, flag);
return 1;
}
@@ -194,7 +182,6 @@ static int do_basic_checks(struct nf_conn *ct,
set_bit(sch->type, map);
}
- pr_debug("Basic checks passed\n");
return count == 0;
}
@@ -204,64 +191,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
{
int i;
- pr_debug("Chunk type: %d\n", chunk_type);
-
switch (chunk_type) {
case SCTP_CID_INIT:
- pr_debug("SCTP_CID_INIT\n");
i = 0;
break;
case SCTP_CID_INIT_ACK:
- pr_debug("SCTP_CID_INIT_ACK\n");
i = 1;
break;
case SCTP_CID_ABORT:
- pr_debug("SCTP_CID_ABORT\n");
i = 2;
break;
case SCTP_CID_SHUTDOWN:
- pr_debug("SCTP_CID_SHUTDOWN\n");
i = 3;
break;
case SCTP_CID_SHUTDOWN_ACK:
- pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
i = 4;
break;
case SCTP_CID_ERROR:
- pr_debug("SCTP_CID_ERROR\n");
i = 5;
break;
case SCTP_CID_COOKIE_ECHO:
- pr_debug("SCTP_CID_COOKIE_ECHO\n");
i = 6;
break;
case SCTP_CID_COOKIE_ACK:
- pr_debug("SCTP_CID_COOKIE_ACK\n");
i = 7;
break;
case SCTP_CID_SHUTDOWN_COMPLETE:
- pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
case SCTP_CID_HEARTBEAT:
- pr_debug("SCTP_CID_HEARTBEAT");
i = 9;
break;
case SCTP_CID_HEARTBEAT_ACK:
- pr_debug("SCTP_CID_HEARTBEAT_ACK");
i = 10;
break;
default:
/* Other chunks like DATA or SACK do not change the state */
- pr_debug("Unknown chunk type, Will stay in %s\n",
- sctp_conntrack_names[cur_state]);
+ pr_debug("Unknown chunk type %d, Will stay in %s\n",
+ chunk_type, sctp_conntrack_names[cur_state]);
return cur_state;
}
- pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
- dir, sctp_conntrack_names[cur_state], chunk_type,
- sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
-
return sctp_conntracks[dir][i][cur_state];
}
@@ -308,7 +278,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
pr_debug("Setting vtag %x for secondary conntrack\n",
sh->vtag);
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
- } else {
+ } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) {
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
pr_debug("Setting vtag %x for new conn OOTB\n",
@@ -349,7 +319,7 @@ static bool sctp_error(struct sk_buff *skb,
}
return false;
out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg);
return true;
}
@@ -369,6 +339,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
u_int32_t offset, count;
unsigned int *timeouts;
unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+ bool ignore = false;
if (sctp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -377,7 +348,7 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
if (sh == NULL)
goto out;
- if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ if (do_basic_checks(ct, skb, dataoff, map, state) != 0)
goto out;
if (!nf_ct_is_confirmed(ct)) {
@@ -400,7 +371,9 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_HEARTBEAT, map) &&
!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "verification tag check failed %x vs %x for dir %d",
+ sh->vtag, ct->proto.sctp.vtag[dir], dir);
goto out;
}
@@ -409,33 +382,67 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Special cases of Verification tag check (Sec 8.5.1) */
if (sch->type == SCTP_CID_INIT) {
- /* Sec 8.5.1 (A) */
+ /* (A) vtag MUST be zero */
if (sh->vtag != 0)
goto out_unlock;
} else if (sch->type == SCTP_CID_ABORT) {
- /* Sec 8.5.1 (B) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir])
+ /* (B) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
- /* Sec 8.5.1 (C) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir] &&
- sch->flags & SCTP_CHUNK_FLAG_T)
+ /* (C) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
- /* Sec 8.5.1 (D) */
+ /* (D) vtag must be same as init_vtag as found in INIT_ACK */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
- } else if (sch->type == SCTP_CID_HEARTBEAT ||
- sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ } else if (sch->type == SCTP_CID_COOKIE_ACK) {
+ ct->proto.sctp.init[dir] = 0;
+ ct->proto.sctp.init[!dir] = 0;
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.last_dir = dir;
+ ignore = true;
+ continue;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ }
+ } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
if (ct->proto.sctp.vtag[dir] == 0) {
pr_debug("Setting vtag %x for dir %d\n",
sh->vtag, dir);
ct->proto.sctp.vtag[dir] = sh->vtag;
} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
- goto out_unlock;
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
+ ct->proto.sctp.last_dir == dir)
+ goto out_unlock;
+
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ ct->proto.sctp.vtag[!dir] = 0;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
}
}
@@ -444,46 +451,76 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct,
/* Invalid */
if (new_state == SCTP_CONNTRACK_MAX) {
- pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
- "conntrack=%u\n",
- dir, sch->type, old_state);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "Invalid, old_state %d, dir %d, type %d",
+ old_state, dir, sch->type);
+
goto out_unlock;
}
/* If it is an INIT or an INIT ACK note down the vtag */
- if (sch->type == SCTP_CID_INIT ||
- sch->type == SCTP_CID_INIT_ACK) {
- struct sctp_inithdr _inithdr, *ih;
+ if (sch->type == SCTP_CID_INIT) {
+ struct sctp_inithdr _ih, *ih;
- ih = skb_header_pointer(skb, offset + sizeof(_sch),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL)
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
goto out_unlock;
- pr_debug("Setting vtag %x for dir %d\n",
- ih->init_tag, !dir);
+
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
+ ct->proto.sctp.init[!dir] = 0;
+ ct->proto.sctp.init[dir] = 1;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
+ ct->proto.sctp.vtag[!dir] = ih->init_tag;
+
+ /* don't renew timeout on init retransmit so
+ * port reuse by client or NAT middlebox cannot
+ * keep entry alive indefinitely (incl. nat info).
+ */
+ if (new_state == SCTP_CONNTRACK_CLOSED &&
+ old_state == SCTP_CONNTRACK_CLOSED &&
+ nf_ct_is_confirmed(ct))
+ ignore = true;
+ } else if (sch->type == SCTP_CID_INIT_ACK) {
+ struct sctp_inithdr _ih, *ih;
+ __be32 vtag;
+
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
+ goto out_unlock;
+
+ vtag = ct->proto.sctp.vtag[!dir];
+ if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
+ goto out_unlock;
+ /* collision */
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
+ vtag != ih->init_tag)
+ goto out_unlock;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
ct->proto.sctp.vtag[!dir] = ih->init_tag;
}
ct->proto.sctp.state = new_state;
- if (old_state != new_state)
+ if (old_state != new_state) {
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ if (new_state == SCTP_CONNTRACK_ESTABLISHED &&
+ !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ }
}
spin_unlock_bh(&ct->lock);
+ /* allow but do not refresh timeout */
+ if (ignore)
+ return NF_ACCEPT;
+
timeouts = nf_ct_timeout_lookup(ct);
if (!timeouts)
timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
- if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
- dir == IP_CT_DIR_REPLY &&
- new_state == SCTP_CONNTRACK_ESTABLISHED) {
- pr_debug("Setting assured bit\n");
- set_bit(IPS_ASSURED_BIT, &ct->status);
- nf_conntrack_event_cache(IPCT_ASSURED, ct);
- }
-
return NF_ACCEPT;
out_unlock:
@@ -512,7 +549,7 @@ static bool sctp_can_early_drop(const struct nf_conn *ct)
#include <linux/netfilter/nfnetlink_conntrack.h>
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
@@ -521,15 +558,20 @@ static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
- nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 1926fd56df56..0c1d086e96cb 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -14,7 +14,7 @@
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
@@ -31,20 +31,6 @@
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-/* "Be conservative in what you do,
- be liberal in what you accept from others."
- If it's non-zero, we mark only out of window RST segments as INVALID. */
-static int nf_ct_tcp_be_liberal __read_mostly = 0;
-
-/* If it is set to zero, we disable picking up already established
- connections. */
-static int nf_ct_tcp_loose __read_mostly = 1;
-
-/* Max number of the retransmitted packets without receiving an (acceptable)
- ACK from the destination. If this number is reached, a shorter timer
- will be started. */
-static int nf_ct_tcp_max_retrans __read_mostly = 3;
-
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR */
@@ -61,6 +47,12 @@ static const char *const tcp_conntrack_names[] = {
"SYN_SENT2",
};
+enum nf_ct_tcp_action {
+ NFCT_TCP_IGNORE,
+ NFCT_TCP_INVALID,
+ NFCT_TCP_ACCEPT,
+};
+
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
@@ -352,10 +344,11 @@ static void tcp_options(const struct sk_buff *skb,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
- state->td_scale =
- state->flags = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
while (length > 0) {
int opcode=*ptr++;
@@ -408,7 +401,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
/* Fast path for timestamp-only option */
if (length == TCPOLEN_TSTAMP_ALIGNED
@@ -458,23 +452,73 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-static bool tcp_in_window(const struct nf_conn *ct,
- struct ip_ct_tcp *state,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- unsigned int dataoff,
- const struct tcphdr *tcph)
+static void tcp_init_sender(struct ip_ct_tcp_state *sender,
+ struct ip_ct_tcp_state *receiver,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u32 end, u32 win,
+ enum ip_conntrack_dir dir)
{
- struct net *net = nf_ct_net(ct);
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ /* SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /* RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (dir == IP_CT_DIR_REPLY &&
+ !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
+ sender->td_scale = 0;
+ receiver->td_scale = 0;
+ }
+}
+
+__printf(6, 7)
+static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb,
+ const struct nf_conn *ct,
+ const struct nf_hook_state *state,
+ const struct ip_ct_tcp_state *sender,
+ enum nf_ct_tcp_action ret,
+ const char *fmt, ...)
+{
+ const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct));
+ struct va_format vaf;
+ va_list args;
+ bool be_liberal;
+
+ be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal;
+ if (be_liberal)
+ return NFCT_TCP_ACCEPT;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf);
+ va_end(args);
+
+ return ret;
+}
+
+static enum nf_ct_tcp_action
+tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
+ unsigned int index, const struct sk_buff *skb,
+ unsigned int dataoff, const struct tcphdr *tcph,
+ const struct nf_hook_state *hook_state)
+{
+ struct ip_ct_tcp *state = &ct->proto.tcp;
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
- const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
- u16 win_raw;
+ bool in_recv_win, seq_ok;
s32 receiver_offset;
- bool res, in_recv_win;
+ u16 win_raw;
/*
* Get the required data from the packet.
@@ -493,44 +537,17 @@ static bool tcp_in_window(const struct nf_conn *ct,
ack -= receiver_offset;
sack -= receiver_offset;
- pr_debug("tcp_in_window: START\n");
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
if (sender->td_maxwin == 0) {
/*
* Initialize sender data.
*/
if (tcph->syn) {
- /*
- * SYN-ACK in reply to a SYN
- * or SYN from reply direction in simultaneous open.
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, dataoff, tcph, sender);
- /*
- * RFC 1323:
- * Both sides must send the Window Scale option
- * to enable window scaling in either direction.
- */
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
- && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
- receiver->td_scale = 0;
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win, dir);
if (!tcph->ack)
/* Simultaneous open */
- return true;
+ return NFCT_TCP_ACCEPT;
} else {
/*
* We are in the middle of a connection,
@@ -541,29 +558,39 @@ static bool tcp_in_window(const struct nf_conn *ct,
swin = win << sender->td_scale;
sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
- /*
- * We haven't seen traffic in the other direction yet
- * but we have to tweak window tracking to pass III
- * and IV until that happens.
- */
- if (receiver->td_maxwin == 0)
+ if (receiver->td_maxwin == 0) {
+ /* We haven't seen traffic in the other
+ * direction yet but we have to tweak window
+ * tracking to pass III and IV until that
+ * happens.
+ */
receiver->td_end = receiver->td_maxend = sack;
+ } else if (sack == receiver->td_end + 1) {
+ /* Likely a reply to a keepalive.
+ * Needed for III.
+ */
+ receiver->td_end++;
+ }
+
}
- } else if (((state->state == TCP_CONNTRACK_SYN_SENT
- && dir == IP_CT_DIR_ORIGINAL)
- || (state->state == TCP_CONNTRACK_SYN_RECV
- && dir == IP_CT_DIR_REPLY))
- && after(end, sender->td_end)) {
+ } else if (tcph->syn &&
+ after(end, sender->td_end) &&
+ (state->state == TCP_CONNTRACK_SYN_SENT ||
+ state->state == TCP_CONNTRACK_SYN_RECV)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
* not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
+ *
+ * Re-init state for this direction, just like for the first
+ * syn(-ack) reply, it might differ in seq, ack or tcp options.
*/
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win, dir);
- tcp_options(skb, dataoff, tcph, sender);
+ if (dir == IP_CT_DIR_REPLY && !tcph->ack)
+ return NFCT_TCP_ACCEPT;
}
if (!(tcph->ack)) {
@@ -587,113 +614,166 @@ static bool tcp_in_window(const struct nf_conn *ct,
*/
seq = end = sender->td_end;
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
+ seq_ok = before(seq, sender->td_maxend + 1);
+ if (!seq_ok) {
+ u32 overshot = end - sender->td_maxend + 1;
+ bool ack_ok;
+
+ ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);
+ in_recv_win = receiver->td_maxwin &&
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ if (in_recv_win &&
+ ack_ok &&
+ overshot <= receiver->td_maxwin &&
+ before(sack, receiver->td_end + 1)) {
+ /* Work around TCPs that send more bytes than allowed by
+ * the receive window.
+ *
+ * If the (marked as invalid) packet is allowed to pass by
+ * the ruleset and the peer acks this data, then its possible
+ * all future packets will trigger 'ACK is over upper bound' check.
+ *
+ * Thus if only the sequence check fails then do update td_end so
+ * possible ACK for this data can update internal state.
+ */
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "%u bytes more than expected", overshot);
+ }
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "SEQ is over upper bound %u (over the window of the receiver)",
+ sender->td_maxend + 1);
+ }
+
+ if (!before(sack, receiver->td_end + 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "ACK is over upper bound %u (ACKed data not seen yet)",
+ receiver->td_end + 1);
/* Is the ending sequence in the receive window (if available)? */
in_recv_win = !receiver->td_maxwin ||
after(end, sender->td_end - receiver->td_maxwin - 1);
+ if (!in_recv_win)
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "SEQ is under lower bound %u (already ACKed data retransmitted)",
+ sender->td_end - receiver->td_maxwin - 1);
+ if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "ignored ACK under lower bound %u (possible overly delayed)",
+ receiver->td_end - MAXACKWINDOW(sender) - 1);
+
+ /* Take into account window scaling (RFC 1323). */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /* Update sender data. */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end)) {
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack)) {
+ sender->td_maxack = ack;
+ }
+ }
- pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- (in_recv_win ? 1 : 0),
- before(sack, receiver->td_end + 1),
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+ /* Update receiver data. */
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ /* Check retransmissions. */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir &&
+ state->last_seq == seq &&
+ state->last_ack == ack &&
+ state->last_end == end &&
+ state->last_win == win_raw) {
+ state->retrans++;
+ } else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win_raw;
+ state->retrans = 0;
+ }
+ }
- if (before(seq, sender->td_maxend + 1) &&
- in_recv_win &&
- before(sack, receiver->td_end + 1) &&
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
- /*
- * Take into account window scaling (RFC 1323).
- */
- if (!tcph->syn)
- win <<= sender->td_scale;
+ return NFCT_TCP_ACCEPT;
+}
- /*
- * Update sender data.
- */
- swin = win + (sack - ack);
- if (sender->td_maxwin < swin)
- sender->td_maxwin = swin;
- if (after(end, sender->td_end)) {
- sender->td_end = end;
- sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
- }
- if (tcph->ack) {
- if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
- sender->td_maxack = ack;
- sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
- } else if (after(ack, sender->td_maxack))
- sender->td_maxack = ack;
- }
+static void __cold nf_tcp_handle_invalid(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ int index,
+ const struct sk_buff *skb,
+ const struct nf_hook_state *hook_state)
+{
+ const unsigned int *timeouts;
+ const struct nf_tcp_net *tn;
+ unsigned int timeout;
+ u32 expires;
- /*
- * Update receiver data.
- */
- if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
- receiver->td_maxwin += end - sender->td_maxend;
- if (after(sack + win, receiver->td_maxend - 1)) {
- receiver->td_maxend = sack + win;
- if (win == 0)
- receiver->td_maxend++;
- }
- if (ack == receiver->td_end)
- receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return;
- /*
- * Check retransmissions.
- */
- if (index == TCP_ACK_SET) {
- if (state->last_dir == dir
- && state->last_seq == seq
- && state->last_ack == ack
- && state->last_end == end
- && state->last_win == win_raw)
- state->retrans++;
- else {
- state->last_dir = dir;
- state->last_seq = seq;
- state->last_ack = ack;
- state->last_end = end;
- state->last_win = win_raw;
- state->retrans = 0;
- }
- }
- res = true;
- } else {
- res = false;
- if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
- tn->tcp_be_liberal)
- res = true;
- if (!res) {
- nf_ct_l4proto_log_invalid(skb, ct,
- "%s",
- before(seq, sender->td_maxend + 1) ?
- in_recv_win ?
- before(sack, receiver->td_end + 1) ?
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
- : "ACK is under the lower bound (possible overly delayed ACK)"
- : "ACK is over the upper bound (ACKed data not seen yet)"
- : "SEQ is under the lower bound (already ACKed data retransmitted)"
- : "SEQ is over the upper bound (over the window of the receiver)");
- }
+ /* We don't want to have connections hanging around in ESTABLISHED
+ * state for long time 'just because' conntrack deemed a FIN/RST
+ * out-of-window.
+ *
+ * Shrink the timeout just like when there is unacked data.
+ * This speeds up eviction of 'dead' connections where the
+ * connection and conntracks internal state are out of sync.
+ */
+ switch (index) {
+ case TCP_RST_SET:
+ case TCP_FIN_SET:
+ break;
+ default:
+ return;
}
- pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+ if (ct->proto.tcp.last_dir != dir &&
+ (ct->proto.tcp.last_index == TCP_FIN_SET ||
+ ct->proto.tcp.last_index == TCP_RST_SET)) {
+ expires = nf_ct_expires(ct);
+ if (expires < 120 * HZ)
+ return;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
+
+ timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]);
+ if (expires > timeout) {
+ nf_ct_l4proto_log_invalid(skb, ct, hook_state,
+ "packet (index %d, dir %d) response for index %d lower timeout to %u",
+ index, dir, ct->proto.tcp.last_index, timeout);
- return res;
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+ }
+ } else {
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ }
}
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
@@ -715,7 +795,7 @@ static void tcp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
}
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
@@ -757,20 +837,19 @@ static bool tcp_error(const struct tcphdr *th,
static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff,
- const struct tcphdr *th)
+ const struct tcphdr *th,
+ const struct nf_hook_state *state)
{
enum tcp_conntrack new_state;
struct net *net = nf_ct_net(ct);
const struct nf_tcp_net *tn = nf_tcp_pernet(net);
- const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
- const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
/* Don't need lock here: this conntrack not in circulation yet */
new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
- pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ tcp_error_log(skb, state, "invalid new");
return false;
}
@@ -816,21 +895,68 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* tcp_packet will set them */
ct->proto.tcp.last_index = TCP_NONE_SET;
-
- pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- __func__,
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
return true;
}
-static bool nf_conntrack_tcp_established(const struct nf_conn *ct)
+static bool tcp_can_early_drop(const struct nf_conn *ct)
{
- return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
- test_bit(IPS_ASSURED_BIT, &ct->status);
+ switch (ct->proto.tcp.state) {
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ case TCP_CONNTRACK_CLOSE:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+void nf_conntrack_tcp_set_closing(struct nf_conn *ct)
+{
+ enum tcp_conntrack old_state;
+ const unsigned int *timeouts;
+ u32 timeout;
+
+ if (!nf_ct_is_confirmed(ct))
+ return;
+
+ spin_lock_bh(&ct->lock);
+ old_state = ct->proto.tcp.state;
+ ct->proto.tcp.state = TCP_CONNTRACK_CLOSE;
+
+ if (old_state == TCP_CONNTRACK_CLOSE ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+ spin_unlock_bh(&ct->lock);
+ return;
+ }
+
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts) {
+ const struct nf_tcp_net *tn;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = tn->timeouts;
+ }
+
+ timeout = timeouts[TCP_CONNTRACK_CLOSE];
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+
+ spin_unlock_bh(&ct->lock);
+
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+}
+
+static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
+{
+ state->td_end = 0;
+ state->td_maxend = 0;
+ state->td_maxwin = 0;
+ state->td_maxack = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
}
/* Returns verdict for packet, or -1 for invalid. */
@@ -842,9 +968,9 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = nf_tcp_pernet(net);
- struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
+ enum nf_ct_tcp_action res;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
@@ -857,7 +983,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (tcp_error(th, skb, dataoff, state))
return -NF_ACCEPT;
- if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
+ if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state))
return -NF_ACCEPT;
spin_lock_bh(&ct->lock);
@@ -865,7 +991,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
new_state = tcp_conntracks[dir][index][old_state];
- tuple = &ct->tuplehash[dir].tuple;
switch (new_state) {
case TCP_CONNTRACK_SYN_SENT:
@@ -900,7 +1025,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return -NF_REPEAT;
return NF_DROP;
}
- /* Fall through */
+ fallthrough;
case TCP_CONNTRACK_IGNORE:
/* Ignored packets:
*
@@ -939,8 +1064,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
ct->proto.tcp.last_flags;
- memset(&ct->proto.tcp.seen[dir], 0,
- sizeof(struct ip_ct_tcp_state));
+ nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
break;
}
ct->proto.tcp.last_index = index;
@@ -980,9 +1104,18 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags |=
IP_CT_EXP_CHALLENGE_ACK;
}
+
+ /* possible challenge ack reply to syn */
+ if (old_state == TCP_CONNTRACK_SYN_SENT &&
+ index == TCP_ACK_SET &&
+ dir == IP_CT_DIR_REPLY)
+ ct->proto.tcp.last_ack = ntohl(th->ack_seq);
+
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in "
- "state %s ", tcp_conntrack_names[old_state]);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d ignored, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
/* Special case for SYN proxy: when the SYN to the server or
@@ -1001,10 +1134,11 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
}
/* Invalid packet */
- pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d invalid, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return -NF_ACCEPT;
case TCP_CONNTRACK_TIME_WAIT:
/* RFC5961 compliance cause stack to send "challenge-ACK"
@@ -1019,7 +1153,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
/* Detected RFC5961 challenge ACK */
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
return NF_ACCEPT; /* Don't change state */
}
break;
@@ -1038,13 +1172,33 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
if (index != TCP_RST_SET)
break;
- if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
+ /* If we are closing, tuple might have been re-used already.
+ * last_index, last_ack, and all other ct fields used for
+ * sequence/window validation are outdated in that case.
+ *
+ * As the conntrack can already be expired by GC under pressure,
+ * just skip validation checks.
+ */
+ if (tcp_can_early_drop(ct))
+ goto in_window;
+
+ /* td_maxack might be outdated if we let a SYN through earlier */
+ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+ ct->proto.tcp.last_index != TCP_SYN_SET) {
u32 seq = ntohl(th->seq);
- if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+ /* If we are not in established state and SEQ=0 this is most
+ * likely an answer to a SYN we let go through above (last_index
+ * can be updated due to out-of-order ACKs).
+ */
+ if (seq == 0 && !nf_conntrack_tcp_established(ct))
+ break;
+
+ if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+ !tn->tcp_ignore_invalid_rst) {
/* Invalid RST */
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
return -NF_ACCEPT;
}
@@ -1082,29 +1236,38 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
* segments we ignored. */
goto in_window;
}
+
+ /* Reset in response to a challenge-ack we let through earlier */
+ if (old_state == TCP_CONNTRACK_SYN_SENT &&
+ ct->proto.tcp.last_index == TCP_ACK_SET &&
+ ct->proto.tcp.last_dir == IP_CT_DIR_REPLY &&
+ ntohl(th->seq) == ct->proto.tcp.last_ack)
+ goto in_window;
+
break;
default:
/* Keep compilers happy. */
break;
}
- if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
- skb, dataoff, th)) {
+ res = tcp_in_window(ct, dir, index,
+ skb, dataoff, th, state);
+ switch (res) {
+ case NFCT_TCP_IGNORE:
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ case NFCT_TCP_INVALID:
+ nf_tcp_handle_invalid(ct, dir, index, skb, state);
spin_unlock_bh(&ct->lock);
return -NF_ACCEPT;
+ case NFCT_TCP_ACCEPT:
+ break;
}
in_window:
/* From now on we have got in-window packets */
ct->proto.tcp.last_index = index;
ct->proto.tcp.last_dir = dir;
- pr_debug("tcp_conntracks: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
- (th->syn ? 1 : 0), (th->ack ? 1 : 0),
- (th->fin ? 1 : 0), (th->rst ? 1 : 0),
- old_state, new_state);
-
ct->proto.tcp.state = new_state;
if (old_state != new_state
&& new_state == TCP_CONNTRACK_FIN_WAIT)
@@ -1142,6 +1305,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
+
+ if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+ /* do not renew timeout on SYN retransmit.
+ *
+ * Else port reuse by client or NAT middlebox can keep
+ * entry alive indefinitely (including nat info).
+ */
+ return NF_ACCEPT;
+ }
+
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
@@ -1152,7 +1325,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
- /* Set ASSURED if we see see valid ack in ESTABLISHED
+ /* Set ASSURED if we see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
set_bit(IPS_ASSURED_BIT, &ct->status);
@@ -1163,29 +1336,13 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.tcp.state) {
- case TCP_CONNTRACK_FIN_WAIT:
- case TCP_CONNTRACK_LAST_ACK:
- case TCP_CONNTRACK_TIME_WAIT:
- case TCP_CONNTRACK_CLOSE:
- case TCP_CONNTRACK_CLOSE_WAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
@@ -1195,8 +1352,13 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
ct->proto.tcp.seen[0].td_scale) ||
nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
ct->proto.tcp.seen[1].td_scale))
@@ -1211,8 +1373,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
@@ -1428,9 +1590,30 @@ void nf_conntrack_tcp_init_net(struct net *net)
* ->timeouts[0] contains 'new' timeout, like udp or icmp.
*/
tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
- tn->tcp_loose = nf_ct_tcp_loose;
- tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
- tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+
+ /* If it is set to zero, we disable picking up already established
+ * connections.
+ */
+ tn->tcp_loose = 1;
+
+ /* "Be conservative in what you do,
+ * be liberal in what you accept from others."
+ * If it's non-zero, we mark only out of window RST segments as INVALID.
+ */
+ tn->tcp_be_liberal = 0;
+
+ /* If it's non-zero, we turn off RST sequence number check */
+ tn->tcp_ignore_invalid_rst = 0;
+
+ /* Max number of the retransmitted packets without receiving an (acceptable)
+ * ACK from the destination. If this number is reached, a shorter timer
+ * will be started.
+ */
+ tn->tcp_max_retrans = 3;
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ tn->offload_timeout = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 760ca2422816..0030fbe8885c 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -38,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg);
}
static bool udp_error(struct sk_buff *skb,
@@ -81,18 +80,6 @@ static bool udp_error(struct sk_buff *skb,
return false;
}
-static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
- struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- u32 extra_jiffies)
-{
- if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
- ct->status & IPS_NAT_CLASH))
- nf_ct_kill(ct);
- else
- nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
-}
-
/* Returns verdict for packet, and may modify conntracktype */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -101,6 +88,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
const struct nf_hook_state *state)
{
unsigned int *timeouts;
+ unsigned long status;
if (udp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -109,27 +97,34 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
if (!timeouts)
timeouts = udp_get_timeouts(nf_ct_net(ct));
- if (!nf_ct_is_confirmed(ct))
+ status = READ_ONCE(ct->status);
+ if ((status & IPS_CONFIRMED) == 0)
ct->proto.udp.stream_ts = 2 * HZ + jiffies;
/* If we've seen traffic both ways, this is some kind of UDP
* stream. Set Assured.
*/
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ if (status & IPS_SEEN_REPLY) {
unsigned long extra = timeouts[UDP_CT_UNREPLIED];
+ bool stream = false;
/* Still active after two seconds? Extend timeout. */
- if (time_after(jiffies, ct->proto.udp.stream_ts))
+ if (time_after(jiffies, ct->proto.udp.stream_ts)) {
extra = timeouts[UDP_CT_REPLIED];
+ stream = (status & IPS_ASSURED) == 0;
+ }
nf_ct_refresh_acct(ct, ctinfo, skb, extra);
+ /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+ if (unlikely((status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -139,8 +134,7 @@ static void udplite_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDPLITE, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg);
}
static bool udplite_error(struct sk_buff *skb,
@@ -206,12 +200,15 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
nf_ct_refresh_acct(ct, ctinfo, skb,
timeouts[UDP_CT_REPLIED]);
+
+ if (unlikely((ct->status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -276,6 +273,10 @@ void nf_conntrack_udp_init_net(struct net *net)
for (i = 0; i < UDP_CT_MAX; i++)
un->timeouts[i] = udp_timeouts[i];
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ un->offload_timeout = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 1aebd6569d4e..13dc421fc4f5 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -34,10 +34,6 @@ MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
MODULE_DESCRIPTION("SANE connection tracking helper");
MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
-static char *sane_buffer;
-
-static DEFINE_SPINLOCK(nf_sane_lock);
-
#define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS];
static unsigned int ports_c;
@@ -67,14 +63,16 @@ static int help(struct sk_buff *skb,
unsigned int dataoff, datalen;
const struct tcphdr *th;
struct tcphdr _tcph;
- void *sb_ptr;
int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo);
struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple;
- struct sane_request *req;
struct sane_reply_net_start *reply;
+ union {
+ struct sane_request req;
+ struct sane_reply_net_start repl;
+ } buf;
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
@@ -92,56 +90,62 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
datalen = skb->len - dataoff;
-
- spin_lock_bh(&nf_sane_lock);
- sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
- BUG_ON(sb_ptr == NULL);
-
if (dir == IP_CT_DIR_ORIGINAL) {
+ const struct sane_request *req;
+
if (datalen != sizeof(struct sane_request))
- goto out;
+ return NF_ACCEPT;
+
+ req = skb_header_pointer(skb, dataoff, datalen, &buf.req);
+ if (!req)
+ return NF_ACCEPT;
- req = sb_ptr;
if (req->RPC_code != htonl(SANE_NET_START)) {
/* Not an interesting command */
- ct_sane_info->state = SANE_STATE_NORMAL;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
+ return NF_ACCEPT;
}
/* We're interested in the next reply */
- ct_sane_info->state = SANE_STATE_START_REQUESTED;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED);
+ return NF_ACCEPT;
}
+ /* IP_CT_DIR_REPLY */
+
/* Is it a reply to an uninteresting command? */
- if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
- goto out;
+ if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED)
+ return NF_ACCEPT;
/* It's a reply to SANE_NET_START. */
- ct_sane_info->state = SANE_STATE_NORMAL;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
if (datalen < sizeof(struct sane_reply_net_start)) {
pr_debug("NET_START reply too short\n");
- goto out;
+ return NF_ACCEPT;
}
- reply = sb_ptr;
+ datalen = sizeof(struct sane_reply_net_start);
+
+ reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl);
+ if (!reply)
+ return NF_ACCEPT;
+
if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */
pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status));
- goto out;
+ return NF_ACCEPT;
}
/* Invalid saned reply? Ignore it. */
if (reply->zero != 0)
- goto out;
+ return NF_ACCEPT;
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation");
- ret = NF_DROP;
- goto out;
+ return NF_DROP;
}
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
@@ -159,9 +163,6 @@ static int help(struct sk_buff *skb,
}
nf_ct_expect_put(exp);
-
-out:
- spin_unlock_bh(&nf_sane_lock);
return ret;
}
@@ -175,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
- kfree(sane_buffer);
}
static int __init nf_conntrack_sane_init(void)
@@ -184,10 +184,6 @@ static int __init nf_conntrack_sane_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master));
- sane_buffer = kmalloc(65536, GFP_KERNEL);
- if (!sane_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = SANE_PORT;
@@ -207,7 +203,6 @@ static int __init nf_conntrack_sane_init(void)
ret = nf_conntrack_helpers_register(sane, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(sane_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 3066449f8bd8..7ab2b25b57bc 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -232,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
- .len = sizeof(struct nf_conn_seqadj),
- .align = __alignof__(struct nf_conn_seqadj),
- .id = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
- return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
- nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index b83dc9bf0a5d..ca748f8dbff1 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -60,7 +60,7 @@ module_param(sip_external_media, int, 0600);
MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external "
"endpoints (default 0)");
-const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
static int string_len(const struct nf_conn *ct, const char *dptr,
@@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
return ret;
if (ret == 0)
break;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
*in_header = 0;
}
@@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
break;
if (ret == 0)
return ret;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
if (in_header)
@@ -611,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
start += strlen(name);
*val = simple_strtoul(start, &end, 0);
if (start == end)
- return 0;
+ return -1;
if (matchoff && matchlen) {
*matchoff = start - dptr;
*matchlen = end - start;
@@ -1229,6 +1229,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
struct nf_conntrack_expect *exp;
union nf_inet_addr *saddr, daddr;
const struct nf_nat_sip_hooks *hooks;
+ struct nf_conntrack_helper *helper;
__be16 port;
u8 proto;
unsigned int expires = 0;
@@ -1289,10 +1290,14 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
if (sip_direct_signalling)
saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ helper = rcu_dereference(nfct_help(ct)->helper);
+ if (!helper)
+ return NF_DROP;
+
nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
saddr, &daddr, proto, NULL, &port);
exp->timeout.expires = sip_timeout * HZ;
- exp->helper = nfct_help(ct)->helper;
+ exp->helper = helper;
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
hooks = rcu_dereference(nf_nat_sip_hooks);
@@ -1548,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1619,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 6a26299cb064..207b240b14e5 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -14,6 +14,7 @@
#include <linux/sysctl.h>
#endif
+#include <net/netfilter/nf_log.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
@@ -60,18 +61,13 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
ntohs(tuple->src.u.tcp.port),
ntohs(tuple->dst.u.tcp.port));
break;
- case IPPROTO_UDPLITE: /* fallthrough */
+ case IPPROTO_UDPLITE:
case IPPROTO_UDP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.udp.port),
ntohs(tuple->dst.u.udp.port));
break;
- case IPPROTO_DCCP:
- seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.dccp.port),
- ntohs(tuple->dst.u.dccp.port));
- break;
case IPPROTO_SCTP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.sctp.port),
@@ -98,69 +94,87 @@ struct ct_iter_state {
struct seq_net_private p;
struct hlist_nulls_head *hash;
unsigned int htable_size;
+ unsigned int skip_elems;
unsigned int bucket;
u_int64_t time_now;
};
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net,
+ struct ct_iter_state *st)
{
- struct ct_iter_state *st = seq->private;
+ struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int i;
- for (st->bucket = 0;
- st->bucket < st->htable_size;
- st->bucket++) {
- n = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- if (!is_a_nulls(n))
- return n;
- }
- return NULL;
-}
+ for (i = st->bucket; i < st->htable_size; i++) {
+ unsigned int skip = 0;
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
- struct hlist_nulls_node *head)
-{
- struct ct_iter_state *st = seq->private;
+restart:
+ hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct hlist_nulls_node *tmp = n;
+
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
+ if (++skip <= st->skip_elems)
+ continue;
- head = rcu_dereference(hlist_nulls_next_rcu(head));
- while (is_a_nulls(head)) {
- if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= st->htable_size)
- return NULL;
+ /* h should be returned, skip to nulls marker. */
+ while (!is_a_nulls(tmp))
+ tmp = rcu_dereference(hlist_nulls_next_rcu(tmp));
+
+ /* check if h is still linked to hash[i] */
+ if (get_nulls_value(tmp) != i) {
+ skip = 0;
+ goto restart;
+ }
+
+ st->skip_elems = skip;
+ st->bucket = i;
+ return h;
}
- head = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- }
- return head;
-}
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_nulls_node *head = ct_get_first(seq);
+ skip = 0;
+ if (get_nulls_value(n) != i)
+ goto restart;
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
+ st->skip_elems = 0;
+ }
+
+ st->bucket = i;
+ return NULL;
}
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct ct_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
st->time_now = ktime_get_real_ns();
rcu_read_lock();
nf_conntrack_get_ht(&st->hash, &st->htable_size);
- return ct_get_idx(seq, *pos);
+
+ if (*pos == 0) {
+ st->skip_elems = 0;
+ st->bucket = 0;
+ } else if (st->skip_elems) {
+ /* resume from last dumped entry */
+ st->skip_elems--;
+ }
+
+ return ct_get_next(net, st);
}
static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct ct_iter_state *st = s->private;
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- return ct_get_next(s, v);
+ return ct_get_next(net, st);
}
static void ct_seq_stop(struct seq_file *s, void *v)
@@ -172,17 +186,16 @@ static void ct_seq_stop(struct seq_file *s, void *v)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
+ struct lsm_context ctx;
int ret;
- u32 len;
- char *secctx;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return;
- seq_printf(s, "secctx=%s ", secctx);
+ seq_printf(s, "secctx=%s ", ctx.context);
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
}
#else
static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
@@ -262,16 +275,16 @@ static const char* l4proto_name(u16 proto)
case IPPROTO_ICMP: return "icmp";
case IPPROTO_TCP: return "tcp";
case IPPROTO_UDP: return "udp";
- case IPPROTO_DCCP: return "dccp";
case IPPROTO_GRE: return "gre";
case IPPROTO_SCTP: return "sctp";
case IPPROTO_UDPLITE: return "udplite";
+ case IPPROTO_ICMPV6: return "icmpv6";
}
return "unknown";
}
-static unsigned int
+static void
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
struct nf_conn_acct *acct;
@@ -279,14 +292,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
acct = nf_conn_acct_find(ct);
if (!acct)
- return 0;
+ return;
counter = acct->counter;
seq_printf(s, "packets=%llu bytes=%llu ",
(unsigned long long)atomic64_read(&counter[dir].packets),
(unsigned long long)atomic64_read(&counter[dir].bytes));
-
- return 0;
}
/* return 0 on success, 1 in case of error */
@@ -299,10 +310,16 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
WARN_ON(!ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use)))
return 0;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct)) {
+ struct ct_iter_state *st = s->private;
+
+ st->skip_elems--;
nf_ct_kill(ct);
goto release;
}
@@ -335,8 +352,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (seq_has_overflowed(s))
goto release;
- if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL);
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
seq_puts(s, "[UNREPLIED] ");
@@ -345,8 +361,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
- if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_REPLY);
if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
seq_puts(s, "[HW_OFFLOAD] ");
@@ -359,14 +374,14 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#if defined(CONFIG_NF_CONNTRACK_MARK)
- seq_printf(s, "mark=%u ", ct->mark);
+ seq_printf(s, "mark=%u ", READ_ONCE(ct->mark));
#endif
ct_show_secctx(s, ct);
ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
- seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+ seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use));
if (seq_has_overflowed(s))
goto release;
@@ -424,24 +439,26 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
static int ct_cpu_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_net(seq);
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
const struct ip_conntrack_stat *st = v;
+ unsigned int nr_conntracks;
if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
+ nr_conntracks = nf_conntrack_count(net);
+
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- 0,
+ st->clash_resolve,
st->found,
0,
st->invalid,
- st->ignore,
0,
0,
+ st->chaintoolong,
st->insert,
st->insert_failed,
st->drop,
@@ -507,22 +524,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
}
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+u32 nf_conntrack_count(const struct net *net)
+{
+ const struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return atomic_read(&cnet->count);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_count);
+
/* Sysctl support */
#ifdef CONFIG_SYSCTL
-/* Log invalid packets of a given protocol */
-static int log_invalid_proto_min __read_mostly;
-static int log_invalid_proto_max __read_mostly = 255;
-
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
static int
-nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
+nf_conntrack_hash_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
+ /* module_param hashsize could have changed value */
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret < 0 || !write)
return ret;
@@ -535,6 +559,29 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
return ret;
}
+static int
+nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, i;
+
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (ret < 0 || !write)
+ return ret;
+
+ if (*(u8 *)table->data == 0)
+ return 0;
+
+ /* Load nf_log_syslog only if no logger is currently registered */
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ if (nf_log_is_registered(i))
+ return 0;
+ }
+ request_module("%s", "nf_log_syslog");
+
+ return 0;
+}
+
static struct ctl_table_header *nf_ct_netfilter_header;
enum nf_ct_sysctl_index {
@@ -545,7 +592,6 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_LOG_INVALID,
NF_SYSCTL_CT_EXPECT_MAX,
NF_SYSCTL_CT_ACCT,
- NF_SYSCTL_CT_HELPER,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
NF_SYSCTL_CT_EVENTS,
#endif
@@ -563,11 +609,18 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
+#endif
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+ NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
+#endif
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
#ifdef CONFIG_NF_CT_PROTO_SCTP
@@ -579,39 +632,27 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
- NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING,
- NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT,
- NF_SYSCTL_CT_PROTO_DCCP_LOOSE,
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
#endif
- __NF_SYSCTL_CT_LAST_SYSCTL,
+ NF_SYSCTL_CT_LAST_SYSCTL,
};
-#define NF_SYSCTL_CT_LAST_SYSCTL (__NF_SYSCTL_CT_LAST_SYSCTL + 1)
-
static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_MAX] = {
.procname = "nf_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
- .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -626,43 +667,34 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_CHECKSUM] = {
.procname = "nf_conntrack_checksum",
.data = &init_net.ct.sysctl_checksum,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_LOG_INVALID] = {
.procname = "nf_conntrack_log_invalid",
.data = &init_net.ct.sysctl_log_invalid,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
+ .proc_handler = nf_conntrack_log_invalid_sysctl,
},
[NF_SYSCTL_CT_EXPECT_MAX] = {
.procname = "nf_conntrack_expect_max",
.data = &nf_ct_expect_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_ACCT] = {
.procname = "nf_conntrack_acct",
.data = &init_net.ct.sysctl_acct,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- [NF_SYSCTL_CT_HELPER] = {
- .procname = "nf_conntrack_helper",
- .data = &init_net.ct.sysctl_auto_assign_helper,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -670,20 +702,20 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_EVENTS] = {
.procname = "nf_conntrack_events",
.data = &init_net.ct.sysctl_events,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
[NF_SYSCTL_CT_TIMESTAMP] = {
.procname = "nf_conntrack_timestamp",
.data = &init_net.ct.sysctl_tstamp,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -754,27 +786,43 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = {
+ .procname = "nf_flowtable_tcp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
.procname = "nf_conntrack_tcp_loose",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = {
.procname = "nf_conntrack_tcp_be_liberal",
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+ .procname = "nf_conntrack_tcp_ignore_invalid_rst",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
.procname = "nf_conntrack_tcp_max_retrans",
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = {
.procname = "nf_conntrack_udp_timeout",
@@ -788,6 +836,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = {
+ .procname = "nf_flowtable_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
.procname = "nf_conntrack_icmp_timeout",
.maxlen = sizeof(unsigned int),
@@ -849,64 +905,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED] = {
- .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
-#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
- .procname = "nf_conntrack_dccp_timeout_request",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = {
- .procname = "nf_conntrack_dccp_timeout_respond",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = {
- .procname = "nf_conntrack_dccp_timeout_partopen",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = {
- .procname = "nf_conntrack_dccp_timeout_open",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = {
- .procname = "nf_conntrack_dccp_timeout_closereq",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = {
- .procname = "nf_conntrack_dccp_timeout_closing",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = {
- .procname = "nf_conntrack_dccp_timeout_timewait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = {
- .procname = "nf_conntrack_dccp_loose",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = {
@@ -922,7 +920,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
#endif
- {}
};
static struct ctl_table nf_ct_netfilter_table[] = {
@@ -931,9 +928,10 @@ static struct ctl_table nf_ct_netfilter_table[] = {
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
- { }
};
static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
@@ -962,7 +960,13 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
XASSIGN(LOOSE, &tn->tcp_loose);
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+ XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
#undef XASSIGN
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
+#endif
+
}
static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
@@ -983,34 +987,10 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
XASSIGN(SHUTDOWN_RECD, sn);
XASSIGN(SHUTDOWN_ACK_SENT, sn);
XASSIGN(HEARTBEAT_SENT, sn);
- XASSIGN(HEARTBEAT_ACKED, sn);
#undef XASSIGN
#endif
}
-static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net,
- struct ctl_table *table)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
-#define XASSIGN(XNAME, dn) \
- table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \
- &(dn)->dccp_timeout[CT_DCCP_ ## XNAME]
-
- XASSIGN(REQUEST, dn);
- XASSIGN(RESPOND, dn);
- XASSIGN(PARTOPEN, dn);
- XASSIGN(OPEN, dn);
- XASSIGN(CLOSEREQ, dn);
- XASSIGN(CLOSING, dn);
- XASSIGN(TIMEWAIT, dn);
-#undef XASSIGN
-
- table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose;
-#endif
-}
-
static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
struct ctl_table *table)
{
@@ -1024,6 +1004,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
static int nf_conntrack_standalone_init_sysctl(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_udp_net *un = nf_udp_pernet(net);
struct ctl_table *table;
@@ -1034,11 +1015,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
if (!table)
return -ENOMEM;
- table[NF_SYSCTL_CT_COUNT].data = &net->ct.count;
+ table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
- table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
#endif
@@ -1050,27 +1030,25 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout;
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED];
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
+#endif
nf_conntrack_standalone_init_tcp_sysctl(net, table);
nf_conntrack_standalone_init_sctp_sysctl(net, table);
- nf_conntrack_standalone_init_dccp_sysctl(net, table);
nf_conntrack_standalone_init_gre_sysctl(net, table);
- /* Don't allow unprivileged users to alter certain sysctls */
- if (net->user_ns != &init_user_ns) {
+ /* Don't allow non-init_net ns to alter global sysctls */
+ if (!net_eq(&init_net, net)) {
table[NF_SYSCTL_CT_MAX].mode = 0444;
table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
- table[NF_SYSCTL_CT_HELPER].mode = 0444;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- table[NF_SYSCTL_CT_EVENTS].mode = 0444;
-#endif
- table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
- } else if (!net_eq(&init_net, net)) {
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
}
- net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
- if (!net->ct.sysctl_header)
+ cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter",
+ table,
+ ARRAY_SIZE(nf_ct_sysctl_table));
+ if (!cnet->sysctl_header)
goto out_unregister_netfilter;
return 0;
@@ -1082,10 +1060,11 @@ out_unregister_netfilter:
static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
- struct ctl_table *table;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ const struct ctl_table *table;
- table = net->ct.sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.sysctl_header);
+ table = cnet->sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(cnet->sysctl_header);
kfree(table);
}
#else
@@ -1180,11 +1159,12 @@ static int __init nf_conntrack_standalone_init(void)
nf_conntrack_htable_size_user = nf_conntrack_htable_size;
#endif
+ nf_conntrack_init_end();
+
ret = register_pernet_subsys(&nf_conntrack_net_ops);
if (ret < 0)
goto out_pernet;
- nf_conntrack_init_end();
return 0;
out_pernet:
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 14387e0b8008..0cc584d3dbb1 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -22,19 +22,21 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
static int untimeout(struct nf_conn *ct, void *timeout)
{
struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ if (timeout_ext) {
+ const struct nf_ct_timeout *t;
+
+ t = rcu_access_pointer(timeout_ext->timeout);
+
+ if (!timeout || t == timeout)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
/* We are not intended to delete this conntrack. */
return 0;
@@ -42,37 +44,41 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = timeout,
+ };
+
+ nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
{
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
- if (timeout_put)
- timeout_put(timeout);
+ if (h)
+ h->timeout_put(timeout);
}
int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
u8 l3num, u8 l4num, const char *timeout_name)
{
- typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
+ const struct nf_ct_timeout_hooks *h;
struct nf_ct_timeout *timeout;
struct nf_conn_timeout *timeout_ext;
const char *errmsg = NULL;
int ret = 0;
rcu_read_lock();
- timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
- if (!timeout_find_get) {
+ h = rcu_dereference(nf_ct_timeout_hook);
+ if (!h) {
ret = -ENOENT;
errmsg = "Timeout policy base is empty";
goto out;
}
- timeout = timeout_find_get(net, timeout_name);
+ timeout = h->timeout_find_get(net, timeout_name);
if (!timeout) {
ret = -ENOENT;
pr_info_ratelimited("No such timeout policy \"%s\"\n",
@@ -119,37 +125,22 @@ EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
void nf_ct_destroy_timeout(struct nf_conn *ct)
{
struct nf_conn_timeout *timeout_ext;
- typeof(nf_ct_timeout_put_hook) timeout_put;
+ const struct nf_ct_timeout_hooks *h;
rcu_read_lock();
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
+ h = rcu_dereference(nf_ct_timeout_hook);
- if (timeout_put) {
+ if (h) {
timeout_ext = nf_ct_timeout_find(ct);
if (timeout_ext) {
- timeout_put(timeout_ext->timeout);
+ struct nf_ct_timeout *t;
+
+ t = rcu_dereference(timeout_ext->timeout);
+ if (t)
+ h->timeout_put(t);
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
}
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
-
-static const struct nf_ct_ext_type timeout_extend = {
- .len = sizeof(struct nf_conn_timeout),
- .align = __alignof__(struct nf_conn_timeout),
- .id = NF_CT_EXT_TIMEOUT,
-};
-
-int nf_conntrack_timeout_init(void)
-{
- int ret = nf_ct_extend_register(&timeout_extend);
- if (ret < 0)
- pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
- return ret;
-}
-
-void nf_conntrack_timeout_fini(void)
-{
- nf_ct_extend_unregister(&timeout_extend);
-}
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index f656d393fa92..9e43a0a59e73 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -19,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-static const struct nf_ct_ext_type tstamp_extend = {
- .len = sizeof(struct nf_conn_tstamp),
- .align = __alignof__(struct nf_conn_tstamp),
- .id = NF_CT_EXT_TSTAMP,
-};
-
void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
}
-
-int nf_conntrack_tstamp_init(void)
-{
- int ret;
- ret = nf_ct_extend_register(&tstamp_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
- nf_ct_extend_unregister(&tstamp_extend);
-}
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index f108a76925dd..fab8b9011098 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -13,13 +13,45 @@
#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+#define NF_RECURSION_LIMIT 2
+
+#ifndef CONFIG_PREEMPT_RT
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+}
+#else
+
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return &current->net_xmit.nf_dup_skb_recursion;
+}
+
+#endif
+
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+ enum nf_dev_hooks hook)
{
- if (skb_mac_header_was_set(skb))
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
+
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
+ goto err;
+
+ if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+ if (skb_cow_head(skb, skb->mac_len))
+ goto err;
+
skb_push(skb, skb->mac_len);
+ }
skb->dev = dev;
+ skb_clear_tstamp(skb);
+ (*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
+ (*nf_dup_skb_recursion)--;
+ return;
+err:
+ kfree_skb(skb);
}
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
@@ -32,7 +64,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
}
- nf_do_netdev_egress(pkt->skb, dev);
+ nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
@@ -47,7 +79,7 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
skb = skb_clone(pkt->skb, GFP_ATOMIC);
if (skb)
- nf_do_netdev_egress(skb, dev);
+ nf_do_netdev_egress(skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
@@ -73,3 +105,4 @@ EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter packet duplication support");
diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c
new file mode 100644
index 000000000000..4a5f5195f2d2
--- /dev/null
+++ b/net/netfilter/nf_flow_table_bpf.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Flow Table Helpers for XDP hook
+ *
+ * These are called from the XDP programs.
+ * Note that it is allowed to break compatibility for these functions since
+ * the interface they are exposed through to BPF programs is explicitly
+ * unstable.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <net/xdp.h>
+
+/* bpf_flowtable_opts - options for bpf flowtable helpers
+ * @error: out parameter, set for any encountered error
+ */
+struct bpf_flowtable_opts {
+ s32 error;
+};
+
+enum {
+ NF_BPF_FLOWTABLE_OPTS_SZ = 4,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_flow_table BTF");
+
+__bpf_kfunc_start_defs();
+
+static struct flow_offload_tuple_rhash *
+bpf_xdp_flow_tuple_lookup(struct net_device *dev,
+ struct flow_offload_tuple *tuple, __be16 proto)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *nf_flow_table;
+ struct flow_offload *nf_flow;
+
+ nf_flow_table = nf_flowtable_by_dev(dev);
+ if (!nf_flow_table)
+ return ERR_PTR(-ENOENT);
+
+ tuplehash = flow_offload_lookup(nf_flow_table, tuple);
+ if (!tuplehash)
+ return ERR_PTR(-ENOENT);
+
+ nf_flow = container_of(tuplehash, struct flow_offload,
+ tuplehash[tuplehash->tuple.dir]);
+ flow_offload_refresh(nf_flow_table, nf_flow, false);
+
+ return tuplehash;
+}
+
+__bpf_kfunc struct flow_offload_tuple_rhash *
+bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple,
+ struct bpf_flowtable_opts *opts, u32 opts_len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct flow_offload_tuple tuple = {
+ .iifidx = fib_tuple->ifindex,
+ .l3proto = fib_tuple->family,
+ .l4proto = fib_tuple->l4_protocol,
+ .src_port = fib_tuple->sport,
+ .dst_port = fib_tuple->dport,
+ };
+ struct flow_offload_tuple_rhash *tuplehash;
+ __be16 proto;
+
+ if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ switch (fib_tuple->family) {
+ case AF_INET:
+ tuple.src_v4.s_addr = fib_tuple->ipv4_src;
+ tuple.dst_v4.s_addr = fib_tuple->ipv4_dst;
+ proto = htons(ETH_P_IP);
+ break;
+ case AF_INET6:
+ tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src;
+ tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst;
+ proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ opts->error = -EAFNOSUPPORT;
+ return NULL;
+ }
+
+ tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto);
+ if (IS_ERR(tuplehash)) {
+ opts->error = PTR_ERR(tuplehash);
+ return NULL;
+ }
+
+ return tuplehash;
+}
+
+__diag_pop()
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nf_ft_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_KFUNCS_END(nf_ft_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_flow_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ft_kfunc_set,
+};
+
+int nf_flow_register_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_flow_kfunc_set);
+}
+EXPORT_SYMBOL_GPL(nf_flow_register_bpf);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 6a3034f84ab6..06e8251a6644 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -39,22 +39,28 @@ flow_offload_fill_dir(struct flow_offload *flow,
ft->l3proto = ctt->src.l3num;
ft->l4proto = ctt->dst.protonum;
- ft->src_port = ctt->src.u.tcp.port;
- ft->dst_port = ctt->dst.u.tcp.port;
+
+ switch (ctt->dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+ break;
+ }
}
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
struct flow_offload *flow;
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(nf_ct_is_dying(ct)))
return NULL;
flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
if (!flow)
- goto err_ct_refcnt;
+ return NULL;
+ refcount_inc(&ct->ct_general.use);
flow->ct = ct;
flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
@@ -66,111 +72,184 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
__set_bit(NF_FLOW_DNAT, &flow->flags);
return flow;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
-err_ct_refcnt:
- nf_ct_put(ct);
+static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
+{
+ if (flow_tuple->l3proto == NFPROTO_IPV6)
+ return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
+
+ return 0;
+}
+
+static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ route->tuple[dir].dst = NULL;
- return NULL;
+ return dst;
}
-EXPORT_SYMBOL_GPL(flow_offload_alloc);
static int flow_offload_fill_route(struct flow_offload *flow,
- const struct nf_flow_route *route,
+ struct nf_flow_route *route,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
- struct dst_entry *dst = route->tuple[dir].dst;
-
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
+ struct dst_entry *dst = nft_route_dst_fetch(route, dir);
+ int i, j = 0;
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
break;
case NFPROTO_IPV6:
- flow_tuple->mtu = ip6_dst_mtu_forward(dst);
+ flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
break;
}
- flow_tuple->iifidx = other_dst->dev->ifindex;
- flow_tuple->dst_cache = dst;
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
+ j++;
+ }
+
+ flow_tuple->tun = route->tuple[dir].in.tun;
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ dst_release(dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ flow_tuple->ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->dst_cache = dst;
+ flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
return 0;
}
-int flow_offload_route_init(struct flow_offload *flow,
- const struct nf_flow_route *route)
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
{
- int err;
-
- err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
- if (err < 0)
- return err;
-
- err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
- if (err < 0)
- goto err_route_reply;
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+void flow_offload_route_init(struct flow_offload *flow,
+ struct nf_flow_route *route)
+{
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
flow->type = NF_FLOW_OFFLOAD_ROUTE;
-
- return 0;
-
-err_route_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-
- return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
- tcp->state = TCP_CONNTRACK_ESTABLISHED;
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
+
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
+{
+ struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+ spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ /* Conntrack state is outdated due to offload bypass.
+ * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
+ * TCP reset validation will fail.
+ */
tcp->seen[0].td_maxwin = 0;
+ tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
tcp->seen[1].td_maxwin = 0;
+ tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ spin_unlock_bh(&ct->lock);
}
-#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
-#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
-
-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct flow_offload *flow)
{
- const struct nf_conntrack_l4proto *l4proto;
+ struct nf_conn *ct = flow->ct;
+ struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
- unsigned int timeout;
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
+ s32 timeout;
+
+ if (l4num == IPPROTO_TCP) {
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
+
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
- l4proto = nf_ct_l4proto_find(l4num);
- if (!l4proto)
- return;
+ } else if (l4num == IPPROTO_UDP) {
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
+ enum udp_conntrack state =
+ test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED;
- if (l4num == IPPROTO_TCP)
- timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
- else if (l4num == IPPROTO_UDP)
- timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
- else
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
+ } else {
return;
+ }
- if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
- ct->timeout = nfct_time_stamp + timeout;
-}
+ if (expired)
+ timeout -= offload_timeout;
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
-{
- if (nf_ct_protonum(ct) == IPPROTO_TCP)
- flow_offload_fixup_tcp(&ct->proto.tcp);
-}
+ if (timeout < 0)
+ timeout = 0;
-static void flow_offload_fixup_ct(struct nf_conn *ct)
-{
- flow_offload_fixup_ct_state(ct);
- flow_offload_fixup_ct_timeout(ct);
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
{
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}
void flow_offload_free(struct flow_offload *flow)
@@ -191,14 +270,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple *tuple = data;
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -207,7 +286,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
return 1;
return 0;
@@ -221,11 +300,30 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = {
.automatic_shrinking = true,
};
+unsigned long flow_offload_get_timeout(struct flow_offload *flow)
+{
+ unsigned long timeout = NF_FLOW_TIMEOUT;
+ struct net *net = nf_ct_net(flow->ct);
+ int l4num = nf_ct_protonum(flow->ct);
+
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ }
+
+ return timeout;
+}
+
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
int err;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
err = rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[0].node,
@@ -243,6 +341,8 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
+ nf_ct_refresh(flow->ct, NF_CT_DAY);
+
if (nf_flowtable_hw_offload(flow_table)) {
__set_bit(NF_FLOW_HW, &flow->flags);
nf_flow_offload_add(flow_table, flow);
@@ -253,23 +353,24 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
EXPORT_SYMBOL_GPL(flow_offload_add);
void flow_offload_refresh(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
+ struct flow_offload *flow, bool force)
{
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ u32 timeout;
- if (likely(!nf_flowtable_hw_offload(flow_table) ||
- !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags)))
+ timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
+ if (force || timeout - READ_ONCE(flow->timeout) > HZ)
+ WRITE_ONCE(flow->timeout, timeout);
+ else
+ return;
+
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
return;
nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return nf_flow_timeout_delta(flow->timeout) <= 0;
-}
-
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
@@ -279,22 +380,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
-
- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
-
- if (nf_flow_has_expired(flow))
- flow_offload_fixup_ct(flow->ct);
- else
- flow_offload_fixup_ct_timeout(flow->ct);
-
flow_offload_free(flow);
}
void flow_offload_teardown(struct flow_offload *flow)
{
- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
-
- flow_offload_fixup_ct_state(flow->ct);
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+ if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -325,7 +418,8 @@ EXPORT_SYMBOL_GPL(flow_offload_lookup);
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -349,7 +443,7 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
- iter(flow, data);
+ iter(flow_table, flow, data);
}
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
@@ -357,14 +451,124 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
-static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
+static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
+ const struct flow_offload *flow)
{
- struct nf_flowtable *flow_table = data;
+ return flow_table->type->gc && flow_table->type->gc(flow);
+}
- if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return: number of seconds when ct entry should expire.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
+{
+ u8 state = READ_ONCE(ct->proto.tcp.state);
- if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
+{
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
+static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
+{
+ bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
+
+ if (nf_flow_has_expired(flow) ||
+ nf_ct_is_dying(flow->ct) ||
+ nf_flow_custom_gc(flow_table, flow)) {
+ flow_offload_teardown(flow);
+ teardown = true;
+ } else if (!teardown) {
+ nf_flow_table_extend_ct_timeout(flow->ct);
+ }
+
+ if (teardown) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
nf_flow_offload_del(flow_table, flow);
@@ -373,129 +577,72 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
} else {
flow_offload_del(flow_table, flow);
}
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
nf_flow_offload_stats(flow_table, flow);
}
}
+void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
+}
+
static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_gc_run(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
- flow_setup_cb_t *cb, void *cb_priv)
-{
- struct flow_block *block = &flow_table->flow_block;
- struct flow_block_cb *block_cb;
- int err = 0;
-
- down_write(&flow_table->flow_block_lock);
- block_cb = flow_block_cb_lookup(block, cb, cb_priv);
- if (block_cb) {
- err = -EEXIST;
- goto unlock;
- }
-
- block_cb = flow_block_cb_alloc(cb, cb_priv, cb_priv, NULL);
- if (IS_ERR(block_cb)) {
- err = PTR_ERR(block_cb);
- goto unlock;
- }
-
- list_add_tail(&block_cb->list, &block->cb_list);
-
-unlock:
- up_write(&flow_table->flow_block_lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb);
-
-void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,
- flow_setup_cb_t *cb, void *cb_priv)
-{
- struct flow_block *block = &flow_table->flow_block;
- struct flow_block_cb *block_cb;
-
- down_write(&flow_table->flow_block_lock);
- block_cb = flow_block_cb_lookup(block, cb, cb_priv);
- if (block_cb) {
- list_del(&block_cb->list);
- flow_block_cb_free(block_cb);
- } else {
- WARN_ON(true);
- }
- up_write(&flow_table->flow_block_lock);
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb);
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
-
- return 0;
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
- new_port, true);
+ new_port, false);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -509,25 +656,19 @@ int nf_flow_snat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -541,11 +682,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
@@ -553,7 +692,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
{
int err;
- INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
flow_block_init(&flowtable->flow_block);
init_rwsem(&flowtable->flow_block_lock);
@@ -573,7 +712,8 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
@@ -614,24 +754,89 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_unlock(&flowtable_lock);
cancel_delayed_work_sync(&flow_table->gc_work);
- nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
nf_flow_table_offload_flush(flow_table);
- if (nf_flowtable_hw_offload(flow_table))
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
- flow_table);
+ /* ... no more pending work after this stage ... */
+ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+ nf_flow_table_gc_run(flow_table);
+ nf_flow_table_offload_flush_cleanup(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int nf_flow_table_init_net(struct net *net)
+{
+ net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
+ return net->ft.stat ? 0 : -ENOMEM;
+}
+
+static void nf_flow_table_fini_net(struct net *net)
+{
+ free_percpu(net->ft.stat);
+}
+
+static int nf_flow_table_pernet_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_flow_table_init_net(net);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ return 0;
+
+out_proc:
+ nf_flow_table_fini_net(net);
+ return ret;
+}
+
+static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_flow_table_fini_proc(net);
+ nf_flow_table_fini_net(net);
+ }
+}
+
+static struct pernet_operations nf_flow_table_net_ops = {
+ .init = nf_flow_table_pernet_init,
+ .exit_batch = nf_flow_table_pernet_exit,
+};
+
static int __init nf_flow_table_module_init(void)
{
- return nf_flow_table_offload_init();
+ int ret;
+
+ ret = register_pernet_subsys(&nf_flow_table_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_offload_init();
+ if (ret)
+ goto out_offload;
+
+ ret = nf_flow_register_bpf();
+ if (ret)
+ goto out_bpf;
+
+ return 0;
+
+out_bpf:
+ nf_flow_table_offload_exit();
+out_offload:
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
+ return ret;
}
static void __exit nf_flow_table_module_exit(void)
{
nf_flow_table_offload_exit();
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
}
module_init(nf_flow_table_module_init);
@@ -639,3 +844,4 @@ module_exit(nf_flow_table_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter flow table module");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 88bedf1ff1ae..b0f199171932 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -6,12 +6,33 @@
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
+#include <linux/if_vlan.h>
static unsigned int
nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return NF_ACCEPT;
+
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (!nf_flow_pppoe_proto(skb, &proto))
+ return NF_ACCEPT;
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
case htons(ETH_P_IP):
return nf_flow_offload_ip_hook(priv, skb, state);
case htons(ETH_P_IPV6):
@@ -22,7 +43,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
}
static int nf_flow_rule_route_inet(struct net *net,
- const struct flow_offload *flow,
+ struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
@@ -54,8 +75,30 @@ static struct nf_flowtable_type flowtable_inet = {
.owner = THIS_MODULE,
};
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+ .family = NFPROTO_IPV6,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv6,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ipv6_hook,
+ .owner = THIS_MODULE,
+};
+
static int __init nf_flow_inet_module_init(void)
{
+ nft_register_flowtable_type(&flowtable_ipv4);
+ nft_register_flowtable_type(&flowtable_ipv6);
nft_register_flowtable_type(&flowtable_inet);
return 0;
@@ -64,6 +107,8 @@ static int __init nf_flow_inet_module_init(void)
static void __exit nf_flow_inet_module_exit(void)
{
nft_unregister_flowtable_type(&flowtable_inet);
+ nft_unregister_flowtable_type(&flowtable_ipv6);
+ nft_unregister_flowtable_type(&flowtable_ipv4);
}
module_init(nf_flow_inet_module_init);
@@ -71,4 +116,7 @@ module_exit(nf_flow_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */
+MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module");
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index a3bca758b849..78883343e5d6 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -7,6 +7,8 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <net/gso.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -25,42 +27,33 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
if (proto != IPPROTO_TCP)
return 0;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
}
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
@@ -68,31 +61,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
+static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
{
switch (iph->protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -107,17 +94,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
iph->daddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -132,31 +117,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
iph->saddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- unsigned int thoff, enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, enum flow_offload_tuple_dir dir,
+ struct iphdr *iph)
{
- struct iphdr *iph = ip_hdr(skb);
-
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
- return -1;
-
- iph = ip_hdr(skb);
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir);
+ }
}
static bool ip_has_options(unsigned int thoff)
@@ -164,44 +142,122 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+static void nf_flow_tuple_encap(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ __be16 inner_proto = skb->protocol;
+ struct vlan_ethhdr *veth;
+ struct pppoe_hdr *phdr;
+ struct iphdr *iph;
+ u16 offset = 0;
+ int i = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ tuple->encap[i].id = skb_vlan_tag_get(skb);
+ tuple->encap[i].proto = skb->vlan_proto;
+ i++;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
+ tuple->encap[i].proto = skb->protocol;
+ inner_proto = veth->h_vlan_encapsulated_proto;
+ offset += VLAN_HLEN;
+ break;
+ case htons(ETH_P_PPP_SES):
+ phdr = (struct pppoe_hdr *)skb_network_header(skb);
+ tuple->encap[i].id = ntohs(phdr->sid);
+ tuple->encap[i].proto = skb->protocol;
+ inner_proto = *((__be16 *)(phdr + 1));
+ offset += PPPOE_SES_HLEN;
+ break;
+ }
+
+ if (inner_proto == htons(ETH_P_IP)) {
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ if (iph->protocol == IPPROTO_IPIP) {
+ tuple->tun.dst_v4.s_addr = iph->daddr;
+ tuple->tun.src_v4.s_addr = iph->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPIP;
+ }
+ }
+}
+
+struct nf_flowtable_ctx {
+ const struct net_device *in;
+ u32 offset;
+ u32 hdrsize;
+};
+
+static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
+ u8 ipproto;
- if (!pskb_may_pull(skb, sizeof(*iph)))
+ if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
return -1;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
+ thoff += ctx->offset;
+
+ ipproto = iph->protocol;
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ ctx->hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ ctx->hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
return -1;
+ }
if (iph->ttl <= 1)
return -1;
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
return -1;
- iph = ip_hdr(skb);
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v4.s_addr = iph->saddr;
tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
- tuple->iifidx = dev->ifindex;
+ tuple->l4proto = ipproto;
+ tuple->iifidx = ctx->in->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -218,12 +274,13 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
-static int nf_flow_offload_dst_check(struct dst_entry *dst)
+static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple)
{
- if (unlikely(dst_xfrm(dst)))
- return dst_check(dst, 0) ? 0 : -1;
+ if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH &&
+ tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM)
+ return true;
- return 0;
+ return dst_check(tuple->dst_cache, tuple->dst_cookie);
}
static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
@@ -236,106 +293,402 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- struct rtable *rt;
- unsigned int thoff;
struct iphdr *iph;
- __be32 nexthop;
+ u16 size;
- if (skb->protocol != htons(ETH_P_IP))
- return NF_ACCEPT;
+ if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+ return false;
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
+ iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+ size = iph->ihl << 2;
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
+ if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
+ return false;
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (iph->protocol == IPPROTO_IPIP)
+ *psize += size;
+
+ return true;
+}
+
+static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+
+ if (iph->protocol != IPPROTO_IPIP)
+ return;
+
+ skb_pull(skb, iph->ihl << 2);
+ skb_reset_network_header(skb);
+}
+
+static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
+ u32 *offset)
+{
+ __be16 inner_proto = skb->protocol;
+ struct vlan_ethhdr *veth;
+ bool ret = false;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return false;
+
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ if (veth->h_vlan_encapsulated_proto == proto) {
+ *offset += VLAN_HLEN;
+ inner_proto = proto;
+ ret = true;
+ }
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_proto(skb, &inner_proto) &&
+ inner_proto == proto) {
+ *offset += PPPOE_SES_HLEN;
+ ret = true;
+ }
+ break;
+ }
+
+ if (inner_proto == htons(ETH_P_IP))
+ ret = nf_flow_ip4_tunnel_proto(skb, offset);
+
+ return ret;
+}
+
+static void nf_flow_encap_pop(struct sk_buff *skb,
+ struct flow_offload_tuple_rhash *tuplehash)
+{
+ struct vlan_hdr *vlan_hdr;
+ int i;
+
+ for (i = 0; i < tuplehash->tuple.encap_num; i++) {
+ if (skb_vlan_tag_present(skb)) {
+ __vlan_hwaccel_clear_tag(skb);
+ continue;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ vlan_hdr = (struct vlan_hdr *)skb->data;
+ __skb_pull(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vlan_hdr);
+ skb_reset_network_header(skb);
+ break;
+ case htons(ETH_P_PPP_SES):
+ skb->protocol = __nf_flow_pppoe_proto(skb);
+ skb_pull(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+ break;
+ }
+ }
+
+ if (skb->protocol == htons(ETH_P_IP))
+ nf_flow_ip4_tunnel_pop(skb);
+}
+
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+};
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ skb->dev = xmit->outdev;
+ dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
+ dev_queue_xmit(skb);
+
+ return NF_STOLEN;
+}
+
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table, struct sk_buff *skb)
+{
+ struct flow_offload_tuple tuple = {};
+
+ if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+
+static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
+{
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ unsigned int thoff, mtu;
+ struct iphdr *iph;
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
-
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (flow->tuplehash[!dir].tuple.tun_num)
+ mtu -= sizeof(*iph);
- thoff = ip_hdr(skb)->ihl * 4;
- if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
- return NF_ACCEPT;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ return 0;
- flow_offload_refresh(flow_table, flow);
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = (iph->ihl * 4) + ctx->offset;
+ if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
+ return 0;
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
- return NF_ACCEPT;
+ return 0;
}
- if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
- return NF_DROP;
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
+
+ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= ctx->offset;
iph = ip_hdr(skb);
+ nf_flow_nat_ip(flow, skb, thoff, dir, iph);
+
ip_decrease_ttl(iph);
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ return 1;
+}
+
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+{
+ int data_len = skb->len + sizeof(__be16);
+ struct ppp_hdr {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *ph;
+ __be16 proto;
+
+ if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ return -1;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ proto = htons(PPP_IP);
+ break;
+ case htons(ETH_P_IPV6):
+ proto = htons(PPP_IPV6);
+ break;
+ default:
+ return -1;
+ }
+
+ __skb_push(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+
+ ph = (struct ppp_hdr *)(skb->data);
+ ph->hdr.ver = 1;
+ ph->hdr.type = 1;
+ ph->hdr.code = 0;
+ ph->hdr.sid = htons(id);
+ ph->hdr.length = htons(data_len);
+ ph->proto = proto;
+ skb->protocol = htons(ETH_P_PPP_SES);
+
+ return 0;
+}
+
+static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+ struct rtable *rt = dst_rtable(tuple->dst_cache);
+ u8 tos = iph->tos, ttl = iph->ttl;
+ __be16 frag_off = iph->frag_off;
+ u32 headroom = sizeof(*iph);
+ int err;
+
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4);
+ if (err)
+ return err;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ err = skb_cow_head(skb, headroom);
+ if (err)
+ return err;
+
+ skb_scrub_packet(skb, true);
+ skb_clear_hash_if_not_l4(skb);
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off;
+ iph->protocol = tuple->tun.l3_proto;
+ iph->tos = tos;
+ iph->daddr = tuple->tun.src_v4.s_addr;
+ iph->saddr = tuple->tun.dst_v4.s_addr;
+ iph->ttl = ttl;
+ iph->tot_len = htons(skb->len);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
+ ip_send_check(iph);
+
+ *ip_daddr = tuple->tun.src_v4.s_addr;
+
+ return 0;
+}
+
+static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ if (tuple->tun_num)
+ return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr);
+
+ return 0;
+}
+
+static int nf_flow_encap_push(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ int i;
+
+ for (i = 0; i < tuple->encap_num; i++) {
+ switch (tuple->encap[i].proto) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ if (skb_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple *other_tuple;
+ enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
+ struct flow_offload *flow;
+ struct neighbour *neigh;
+ struct rtable *rt;
+ __be32 ip_daddr;
+ int ret;
+
+ tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb);
+ if (!tuplehash)
+ return NF_ACCEPT;
+
+ ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
+ return NF_ACCEPT;
+
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip_daddr = other_tuple->src_v4.s_addr;
- return NF_STOLEN;
+ if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
+ return NF_DROP;
+
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
+ return NF_DROP;
+
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha;
+ skb_dst_set_noref(skb, &rt->dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest;
+ xmit.source = tuplehash->tuple.out.h_source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr,
+ struct ipv6hdr *ip6h)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
-
- return 0;
}
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
@@ -343,32 +696,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -383,17 +730,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
ip6h->daddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -408,139 +753,231 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
ip6h->saddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
- return -1;
-
- ip6h = ipv6_hdr(skb);
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
}
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
+ u8 nexthdr;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ thoff = sizeof(*ip6h) + ctx->offset;
+ if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
+ nexthdr = ip6h->nexthdr;
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ ctx->hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ ctx->hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
return -1;
+ }
if (ip6h->hop_limit <= 1)
return -1;
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
return -1;
- ip6h = ipv6_hdr(skb);
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v6 = ip6h->saddr;
tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
- tuple->iifidx = dev->ifindex;
+ tuple->l4proto = nexthdr;
+ tuple->iifidx = ctx->in->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
enum flow_offload_tuple_dir dir;
- const struct in6_addr *nexthop;
struct flow_offload *flow;
- struct net_device *outdev;
+ unsigned int thoff, mtu;
struct ipv6hdr *ip6h;
- struct rt6_info *rt;
-
- if (skb->protocol != htons(ETH_P_IPV6))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
-
- if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
- sizeof(*ip6h)))
- return NF_ACCEPT;
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ return 0;
- flow_offload_refresh(flow_table, flow);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = sizeof(*ip6h) + ctx->offset;
+ if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
+ return 0;
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
flow_offload_teardown(flow);
- return NF_ACCEPT;
+ return 0;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
- return NF_DROP;
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
- if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
- return NF_DROP;
+ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
ip6h = ipv6_hdr(skb);
+ nf_flow_nat_ipv6(flow, skb, dir, ip6h);
+
ip6h->hop_limit--;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ return 1;
+}
+
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct sk_buff *skb)
+{
+ struct flow_offload_tuple tuple = {};
+
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple *other_tuple;
+ enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
+ struct in6_addr *ip6_daddr;
+ struct flow_offload *flow;
+ struct neighbour *neigh;
+ struct rt6_info *rt;
+ int ret;
+
+ tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
+ return NF_ACCEPT;
+
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip6_daddr = &other_tuple->src_v6;
- return NF_STOLEN;
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
+ return NF_DROP;
+
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha;
+ skb_dst_set_noref(skb, &rt->dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest;
+ xmit.source = tuplehash->tuple.out.h_source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 62651e6683f6..d8f7bfd60ac6 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -13,12 +13,13 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct workqueue_struct *nf_flow_offload_wq;
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
struct flow_offload_work {
struct list_head list;
enum flow_cls_command cmd;
- int priority;
struct nf_flowtable *flowtable;
struct flow_offload *flow;
struct work_struct work;
@@ -33,7 +34,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
{
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
- unsigned int enc_keys;
+ unsigned long long enc_keys;
if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
return;
@@ -42,8 +43,8 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
mask->enc_key_id.keyid = 0xffffffff;
- enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
- BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+ enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL);
if (ip_tunnel_info_af(tun_info) == AF_INET) {
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
@@ -54,7 +55,7 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
mask->enc_ipv4.src = 0xffffffff;
if (key->enc_ipv4.dst)
mask->enc_ipv4.dst = 0xffffffff;
- enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
} else {
memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
@@ -63,19 +64,29 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
sizeof(struct in6_addr));
if (memcmp(&key->enc_ipv6.src, &in6addr_any,
sizeof(struct in6_addr)))
- memset(&key->enc_ipv6.src, 0xff,
+ memset(&mask->enc_ipv6.src, 0xff,
sizeof(struct in6_addr));
if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
sizeof(struct in6_addr)))
- memset(&key->enc_ipv6.dst, 0xff,
+ memset(&mask->enc_ipv6.dst, 0xff,
sizeof(struct in6_addr));
- enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
match->dissector.used_keys |= enc_keys;
}
+static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key,
+ struct flow_dissector_key_vlan *mask,
+ u16 vlan_id, __be16 proto)
+{
+ key->vlan_id = vlan_id;
+ mask->vlan_id = VLAN_VID_MASK;
+ key->vlan_tpid = proto;
+ mask->vlan_tpid = 0xffff;
+}
+
static int nf_flow_rule_match(struct nf_flow_match *match,
const struct flow_offload_tuple *tuple,
struct dst_entry *other_dst)
@@ -83,6 +94,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
struct ip_tunnel_info *tun_info;
+ bool vlan_encap = false;
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
@@ -97,9 +109,39 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
nf_flow_rule_lwt_match(match, tun_info);
}
- key->meta.ingress_ifindex = tuple->iifidx;
+ if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC)
+ key->meta.ingress_ifindex = tuple->tc.iifidx;
+ else
+ key->meta.ingress_ifindex = tuple->iifidx;
+
mask->meta.ingress_ifindex = 0xffffffff;
+ if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) &&
+ tuple->encap[0].proto == htons(ETH_P_8021Q)) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[0].id,
+ tuple->encap[0].proto);
+ vlan_encap = true;
+ }
+
+ if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) &&
+ tuple->encap[1].proto == htons(ETH_P_8021Q)) {
+ if (vlan_encap) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN,
+ cvlan);
+ nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ } else {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN,
+ vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ }
+ }
+
switch (tuple->l3proto) {
case AF_INET:
key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -121,16 +163,17 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
return -EOPNOTSUPP;
}
mask->control.addr_type = 0xffff;
- match->dissector.used_keys |= BIT(key->control.addr_type);
+ match->dissector.used_keys |= BIT_ULL(key->control.addr_type);
mask->basic.n_proto = 0xffff;
switch (tuple->l4proto) {
case IPPROTO_TCP:
key->tcp.flags = 0;
mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16);
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP);
break;
case IPPROTO_UDP:
+ case IPPROTO_GRE:
break;
default:
return -EOPNOTSUPP;
@@ -139,15 +182,22 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
key->basic.ip_proto = tuple->l4proto;
mask->basic.ip_proto = 0xff;
- key->tp.src = tuple->src_port;
- mask->tp.src = 0xffff;
- key->tp.dst = tuple->dst_port;
- mask->tp.dst = 0xffff;
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ key->tp.src = tuple->src_port;
+ mask->tp.src = 0xffff;
+ key->tp.dst = tuple->dst_port;
+ mask->tp.dst = 0xffff;
+
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+ break;
+ }
- match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
- BIT(FLOW_DISSECTOR_KEY_CONTROL) |
- BIT(FLOW_DISSECTOR_KEY_BASIC) |
- BIT(FLOW_DISSECTOR_KEY_PORTS);
return 0;
}
@@ -175,27 +225,43 @@ static int flow_offload_eth_src(struct net *net,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- struct net_device *dev;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ struct net_device *dev = NULL;
+ const unsigned char *addr;
u32 mask, val;
u16 val16;
- dev = dev_get_by_index(net, tuple->iifidx);
- if (!dev)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ addr = this_tuple->out.h_source;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ dev = dev_get_by_index(net, other_tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ addr = dev->dev_addr;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
mask = ~0xffff0000;
- memcpy(&val16, dev->dev_addr, 2);
+ memcpy(&val16, addr, 2);
val = val16 << 16;
flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
mask = ~0xffffffff;
- memcpy(&val, dev->dev_addr + 2, 4);
+ memcpy(&val, addr + 2, 4);
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
&val, &mask);
+
dev_put(dev);
return 0;
@@ -208,27 +274,40 @@ static int flow_offload_eth_dst(struct net *net,
{
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- const void *daddr = &flow->tuplehash[!dir].tuple.src_v4;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
const struct dst_entry *dst_cache;
unsigned char ha[ETH_ALEN];
struct neighbour *n;
+ const void *daddr;
u32 mask, val;
u8 nud_state;
u16 val16;
- dst_cache = flow->tuplehash[dir].tuple.dst_cache;
- n = dst_neigh_lookup(dst_cache, daddr);
- if (!n)
- return -ENOENT;
-
- read_lock_bh(&n->lock);
- nud_state = n->nud_state;
- ether_addr_copy(ha, n->ha);
- read_unlock_bh(&n->lock);
+ this_tuple = &flow->tuplehash[dir].tuple;
- if (!(nud_state & NUD_VALID)) {
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ether_addr_copy(ha, this_tuple->out.h_dest);
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ daddr = &other_tuple->src_v4;
+ dst_cache = this_tuple->dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
neigh_release(n);
- return -ENOENT;
+
+ if (!(nud_state & NUD_VALID))
+ return -ENOENT;
+ break;
+ default:
+ return -EOPNOTSUPP;
}
mask = ~0xffffffff;
@@ -241,7 +320,6 @@ static int flow_offload_eth_dst(struct net *net,
val = val16;
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
- neigh_release(n);
return 0;
}
@@ -307,10 +385,10 @@ static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
struct flow_action_entry *entry;
int i;
- for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) {
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) {
entry = flow_action_entry_next(flow_rule);
flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
- offset + i, &addr[i], mask);
+ offset + i * sizeof(u32), &addr[i], mask);
}
}
@@ -463,27 +541,52 @@ static void flow_offload_ipv4_checksum(struct net *net,
}
}
-static void flow_offload_redirect(const struct flow_offload *flow,
+static void flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
- struct rtable *rt;
+ const struct flow_offload_tuple *this_tuple, *other_tuple;
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+ int ifindex;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ this_tuple = &flow->tuplehash[dir].tuple;
+ ifindex = this_tuple->out.ifidx;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ifindex = other_tuple->iifidx;
+ break;
+ default:
+ return;
+ }
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return;
+
+ entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = rt->dst.dev;
- dev_hold(rt->dst.dev);
+ entry->dev = dev;
}
static void flow_offload_encap_tunnel(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *this_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[dir].tuple.dst_cache;
+ this_tuple = &flow->tuplehash[dir].tuple;
+ if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = this_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -500,10 +603,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = other_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -515,10 +623,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
}
}
-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int
+nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
+ const struct flow_offload_tuple *tuple;
+ int i;
+
flow_offload_decap_tunnel(flow, dir, flow_rule);
flow_offload_encap_tunnel(flow, dir, flow_rule);
@@ -526,6 +639,53 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
+ tuple = &flow->tuplehash[dir].tuple;
+
+ for (i = 0; i < tuple->encap_num; i++) {
+ struct flow_action_entry *entry;
+
+ if (tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ if (tuple->encap[i].proto == htons(ETH_P_8021Q)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_VLAN_POP;
+ }
+ }
+
+ other_tuple = &flow->tuplehash[!dir].tuple;
+
+ for (i = 0; i < other_tuple->encap_num; i++) {
+ struct flow_action_entry *entry;
+
+ if (other_tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ entry = flow_action_entry_next(flow_rule);
+
+ switch (other_tuple->encap[i].proto) {
+ case htons(ETH_P_PPP_SES):
+ entry->id = FLOW_ACTION_PPPOE_PUSH;
+ entry->pppoe.sid = other_tuple->encap[i].id;
+ break;
+ case htons(ETH_P_8021Q):
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = other_tuple->encap[i].id;
+ entry->vlan.proto = other_tuple->encap[i].proto;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
+ return -1;
+
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
flow_offload_ipv4_snat(net, flow, dir, flow_rule);
flow_offload_port_snat(net, flow, dir, flow_rule);
@@ -538,21 +698,17 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
test_bit(NF_FLOW_DNAT, &flow->flags))
flow_offload_ipv4_checksum(net, flow, flow_rule);
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- flow_offload_decap_tunnel(flow, dir, flow_rule);
- flow_offload_encap_tunnel(flow, dir, flow_rule);
-
- if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
- flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
@@ -564,7 +720,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
flow_offload_port_dnat(net, flow, dir, flow_rule);
}
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -578,10 +734,10 @@ nf_flow_offload_rule_alloc(struct net *net,
enum flow_offload_tuple_dir dir)
{
const struct nf_flowtable *flowtable = offload->flowtable;
- const struct flow_offload *flow = offload->flow;
- const struct flow_offload_tuple *tuple;
+ const struct flow_offload_tuple *tuple, *other_tuple;
+ struct flow_offload *flow = offload->flow;
+ struct dst_entry *other_dst = NULL;
struct nf_flow_rule *flow_rule;
- struct dst_entry *other_dst;
int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
@@ -597,7 +753,10 @@ nf_flow_offload_rule_alloc(struct net *net,
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- other_dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ other_dst = other_tuple->dst_cache;
+
err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
@@ -682,8 +841,8 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
+ struct netlink_ext_ack extack = {};
struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
__be16 proto = ETH_P_ALL;
int err, i = 0;
@@ -714,7 +873,8 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
return nf_flow_offload_tuple(offload->flowtable, offload->flow,
- flow_rule, dir, offload->priority,
+ flow_rule, dir,
+ offload->flowtable->priority,
FLOW_CLS_REPLACE, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -723,7 +883,8 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_DESTROY, NULL,
+ offload->flowtable->priority,
+ FLOW_CLS_DESTROY, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -734,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload,
ok_count += flow_offload_tuple_add(offload, flow_rule[0],
FLOW_OFFLOAD_DIR_ORIGINAL);
- ok_count += flow_offload_tuple_add(offload, flow_rule[1],
- FLOW_OFFLOAD_DIR_REPLY);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+ FLOW_OFFLOAD_DIR_REPLY);
if (ok_count == 0)
return -ENOENT;
@@ -753,10 +915,11 @@ static void flow_offload_work_add(struct flow_offload_work *offload)
err = flow_offload_rule_add(offload, flow_rule);
if (err < 0)
- set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags);
- else
- set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+ goto out;
+
+ set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+out:
nf_flow_offload_destroy(flow_rule);
}
@@ -764,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload)
{
clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
}
@@ -773,7 +937,8 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload,
struct flow_stats *stats)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_STATS, stats,
+ offload->flowtable->priority,
+ FLOW_CLS_STATS, stats,
&offload->flowtable->flow_block.cb_list);
}
@@ -783,11 +948,13 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
u64 lastused;
flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
- flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY,
+ &stats[1]);
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
- lastused + NF_FLOW_TIMEOUT);
+ lastused + flow_offload_get_timeout(offload->flow));
if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
if (stats[0].pkts)
@@ -804,17 +971,22 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
static void flow_offload_work_handler(struct work_struct *work)
{
struct flow_offload_work *offload;
+ struct net *net;
offload = container_of(work, struct flow_offload_work, work);
+ net = read_pnet(&offload->flowtable->net);
switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add);
break;
case FLOW_CLS_DESTROY:
flow_offload_work_del(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del);
break;
case FLOW_CLS_STATS:
flow_offload_work_stats(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats);
break;
default:
WARN_ON_ONCE(1);
@@ -826,7 +998,18 @@ static void flow_offload_work_handler(struct work_struct *work)
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- queue_work(nf_flow_offload_wq, &offload->work);
+ struct net *net = read_pnet(&offload->flowtable->net);
+
+ if (offload->cmd == FLOW_CLS_REPLACE) {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add);
+ queue_work(nf_flow_offload_add_wq, &offload->work);
+ } else if (offload->cmd == FLOW_CLS_DESTROY) {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del);
+ queue_work(nf_flow_offload_del_wq, &offload->work);
+ } else {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats);
+ queue_work(nf_flow_offload_stats_wq, &offload->work);
+ }
}
static struct flow_offload_work *
@@ -846,7 +1029,6 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
offload->cmd = cmd;
offload->flow = flow;
- offload->priority = flowtable->priority;
offload->flowtable = flowtable;
INIT_WORK(&offload->work, flow_offload_work_handler);
@@ -886,7 +1068,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
__s32 delta;
delta = nf_flow_timeout_delta(flow->timeout);
- if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10))
+ if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10))
return;
offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);
@@ -896,10 +1078,21 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
flow_offload_queue_work(offload);
}
+void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable)
+{
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_del_wq);
+ nf_flow_table_gc_run(flowtable);
+ }
+}
+
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
- if (nf_flowtable_hw_offload(flowtable))
- flush_workqueue(nf_flow_offload_wq);
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_add_wq);
+ flush_workqueue(nf_flow_offload_del_wq);
+ flush_workqueue(nf_flow_offload_stats_wq);
+ }
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -909,6 +1102,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
struct flow_block_cb *block_cb, *next;
int err = 0;
+ down_write(&flowtable->flow_block_lock);
switch (cmd) {
case FLOW_BLOCK_BIND:
list_splice(&bo->cb_list, &flowtable->flow_block.cb_list);
@@ -923,6 +1117,7 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
WARN_ON_ONCE(1);
err = -EOPNOTSUPP;
}
+ up_write(&flowtable->flow_block_lock);
return err;
}
@@ -939,6 +1134,7 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
+ bo->cb_list_head = &flowtable->flow_block.cb_list;
INIT_LIST_HEAD(&bo->cb_list);
}
@@ -950,6 +1146,7 @@ static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb)
nf_flow_table_gc_cleanup(flowtable, dev);
down_write(&flowtable->flow_block_lock);
list_del(&block_cb->list);
+ list_del(&block_cb->driver_list);
flow_block_cb_free(block_cb);
up_write(&flowtable->flow_block_lock);
}
@@ -963,7 +1160,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
extack);
- return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo,
+ return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo,
nf_flow_table_indr_cleanup);
}
@@ -977,7 +1174,9 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
extack);
+ down_write(&flowtable->flow_block_lock);
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
+ up_write(&flowtable->flow_block_lock);
if (err < 0)
return err;
@@ -993,7 +1192,7 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
int err;
if (!nf_flowtable_hw_offload(flowtable))
- return 0;
+ return nf_flow_offload_xdp_setup(flowtable, dev, cmd);
if (dev->netdev_ops->ndo_setup_tc)
err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
@@ -1010,15 +1209,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
int nf_flow_table_offload_init(void)
{
- nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload",
- WQ_UNBOUND, 0);
- if (!nf_flow_offload_wq)
+ nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_add_wq)
return -ENOMEM;
+ nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_del_wq)
+ goto err_del_wq;
+
+ nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_stats_wq)
+ goto err_stats_wq;
+
return 0;
+
+err_stats_wq:
+ destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+ destroy_workqueue(nf_flow_offload_add_wq);
+ return -ENOMEM;
}
void nf_flow_table_offload_exit(void)
{
- destroy_workqueue(nf_flow_offload_wq);
+ destroy_workqueue(nf_flow_offload_add_wq);
+ destroy_workqueue(nf_flow_offload_del_wq);
+ destroy_workqueue(nf_flow_offload_stats_wq);
}
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
new file mode 100644
index 000000000000..f0984cf69a09
--- /dev/null
+++ b/net/netfilter/nf_flow_table_path.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h>
+#include <net/inet_dscp.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nft_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir)
+{
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+}
+
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nft_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+struct nft_forward_info {
+ const struct net_device *indev;
+ const struct net_device *outdev;
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
+ struct flow_offload_tunnel tun;
+ u8 num_tuns;
+ u8 ingress_vlans;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
+};
+
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+ struct nft_forward_info *info,
+ unsigned char *ha, struct nf_flowtable *flowtable)
+{
+ const struct net_device_path *path;
+ int i;
+
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
+ case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
+ case DEV_PATH_TUN:
+ info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths;
+ break;
+ }
+
+ /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
+ if (path->type == DEV_PATH_TUN) {
+ if (info->num_tuns) {
+ info->indev = NULL;
+ break;
+ }
+ info->tun.src_v6 = path->tun.src_v6;
+ info->tun.dst_v6 = path->tun.dst_v6;
+ info->tun.l3_proto = path->tun.l3_proto;
+ info->num_tuns++;
+ } else {
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id =
+ path->encap.id;
+ info->encap[info->num_encaps].proto =
+ path->encap.proto;
+ info->num_encaps++;
+ }
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ break;
+ case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1);
+ break;
+ case DEV_PATH_BR_VLAN_TAG:
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ if (WARN_ON_ONCE(info->num_encaps-- == 0)) {
+ info->indev = NULL;
+ break;
+ }
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ break;
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+ info->outdev = info->indev;
+
+ if (nf_flowtable_hw_offload(flowtable) &&
+ nft_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+ struct nft_flowtable *ft)
+{
+ struct nft_hook *hook;
+ bool found = false;
+
+ list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+ if (!nft_hook_find_ops_rcu(hook, dev))
+ continue;
+
+ found = true;
+ break;
+ }
+
+ return found;
+}
+
+static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
+ struct flow_offload_tunnel *tun,
+ struct nf_flow_route *route,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *cur_dst = route->tuple[dir].dst;
+ struct dst_entry *tun_dst = NULL;
+ struct flowi fl = {};
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = tun->dst_v4.s_addr;
+ fl.u.ip4.saddr = tun->src_v4.s_addr;
+ fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = tun->dst_v6;
+ fl.u.ip6.saddr = tun->src_v6;
+ fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt));
+ if (!tun_dst)
+ return -ENOENT;
+
+ route->tuple[dir].dst = tun_dst;
+ dst_release(cur_dst);
+
+ return 0;
+}
+
+static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
+ struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nft_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nft_dev_path_info(&stack, &info, ha, &ft->data);
+
+ if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+
+ if (info.num_tuns &&
+ !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) {
+ route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
+ route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
+ route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
+ route->tuple[!dir].in.num_tuns = info.num_tuns;
+ }
+
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ struct dst_entry *this_dst = skb_dst(pkt->skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+ fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
+ fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ if (!dst_hold_safe(this_dst))
+ return -ENOENT;
+
+ nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+ if (!other_dst) {
+ dst_release(this_dst);
+ return -ENOENT;
+ }
+
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, dir, ft);
+ if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, !dir, ft);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_flow_route);
diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c
new file mode 100644
index 000000000000..159b033a43e6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_procfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+ (*pos)++;
+ return NULL;
+}
+
+static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_flow_table_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "wq_add wq_del wq_stats\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%8d %8d %8d\n",
+ st->count_wq_add,
+ st->count_wq_del,
+ st->count_wq_stats
+ );
+ return 0;
+}
+
+static const struct seq_operations nf_flow_table_cpu_seq_ops = {
+ .start = nf_flow_table_cpu_seq_start,
+ .next = nf_flow_table_cpu_seq_next,
+ .stop = nf_flow_table_cpu_seq_stop,
+ .show = nf_flow_table_cpu_seq_show,
+};
+
+int nf_flow_table_init_proc(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_net("nf_flowtable", 0444, net->proc_net_stat,
+ &nf_flow_table_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+ return pde ? 0 : -ENOMEM;
+}
+
+void nf_flow_table_fini_proc(struct net *net)
+{
+ remove_proc_entry("nf_flowtable", net->proc_net_stat);
+}
diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c
new file mode 100644
index 000000000000..e1252d042699
--- /dev/null
+++ b/net/netfilter/nf_flow_table_xdp.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct flow_offload_xdp_ft {
+ struct list_head head;
+ struct nf_flowtable *ft;
+ struct rcu_head rcuhead;
+};
+
+struct flow_offload_xdp {
+ struct hlist_node hnode;
+ unsigned long net_device_addr;
+ struct list_head head;
+};
+
+#define NF_XDP_HT_BITS 4
+static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS);
+static DEFINE_MUTEX(nf_xdp_hashtable_lock);
+
+/* caller must hold rcu read lock */
+struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev)
+{
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp *iter;
+
+ hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ struct flow_offload_xdp_ft *ft_elem;
+
+ /* The user is supposed to insert a given net_device
+ * just into a single nf_flowtable so we always return
+ * the first element here.
+ */
+ ft_elem = list_first_or_null_rcu(&iter->head,
+ struct flow_offload_xdp_ft,
+ head);
+ return ft_elem ? ft_elem->ft : NULL;
+ }
+ }
+
+ return NULL;
+}
+
+static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp_ft *ft_elem;
+
+ ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT);
+ if (!ft_elem)
+ return -ENOMEM;
+
+ ft_elem->ft = ft;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (!elem) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT);
+ if (!elem)
+ goto err_unlock;
+
+ elem->net_device_addr = key;
+ INIT_LIST_HEAD(&elem->head);
+ hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key);
+ }
+ list_add_tail_rcu(&ft_elem->head, &elem->head);
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ return 0;
+
+err_unlock:
+ mutex_unlock(&nf_xdp_hashtable_lock);
+ kfree(ft_elem);
+
+ return -ENOMEM;
+}
+
+static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (elem) {
+ struct flow_offload_xdp_ft *ft_elem, *ft_next;
+
+ list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) {
+ if (ft_elem->ft == ft) {
+ list_del_rcu(&ft_elem->head);
+ kfree_rcu(ft_elem, rcuhead);
+ }
+ }
+
+ if (list_empty(&elem->head))
+ hash_del_rcu(&elem->hnode);
+ else
+ elem = NULL;
+ }
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ if (elem) {
+ synchronize_rcu();
+ kfree(elem);
+ }
+}
+
+int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ return nf_flowtable_by_dev_insert(flowtable, dev);
+ case FLOW_BLOCK_UNBIND:
+ nf_flowtable_by_dev_remove(flowtable, dev);
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
+ return 0;
+}
diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c
new file mode 100644
index 000000000000..2d890dd04ff8
--- /dev/null
+++ b/net/netfilter/nf_hooks_lwtunnel.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sysctl.h>
+#include <net/lwtunnel.h>
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+#include <linux/netfilter.h>
+
+#include "nf_internals.h"
+
+static inline int nf_hooks_lwtunnel_get(void)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return 1;
+ else
+ return 0;
+}
+
+static inline int nf_hooks_lwtunnel_set(int enable)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) {
+ if (!enable)
+ return -EBUSY;
+ } else if (enable) {
+ static_branch_enable(&nf_hooks_lwtunnel_enabled);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int proc_nf_hooks_lwtunnel_enabled = 0;
+ struct ctl_table tmp = {
+ .procname = table->procname,
+ .data = &proc_nf_hooks_lwtunnel_enabled,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+ int ret;
+
+ if (!write)
+ proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get();
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0)
+ ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler);
+
+static struct ctl_table nf_lwtunnel_sysctl_table[] = {
+ {
+ .procname = "nf_hooks_lwtunnel",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = nf_hooks_lwtunnel_sysctl_handler,
+ },
+};
+
+static int __net_init nf_lwtunnel_net_init(struct net *net)
+{
+ struct ctl_table_header *hdr;
+ struct ctl_table *table;
+
+ table = nf_lwtunnel_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_lwtunnel_sysctl_table,
+ sizeof(nf_lwtunnel_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ }
+
+ hdr = register_net_sysctl_sz(net, "net/netfilter", table,
+ ARRAY_SIZE(nf_lwtunnel_sysctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->nf.nf_lwtnl_dir_header = hdr;
+
+ return 0;
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit nf_lwtunnel_net_exit(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->nf.nf_lwtnl_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations nf_lwtunnel_net_ops = {
+ .init = nf_lwtunnel_net_init,
+ .exit = nf_lwtunnel_net_exit,
+};
+
+int __init netfilter_lwtunnel_init(void)
+{
+ return register_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+
+void netfilter_lwtunnel_fini(void)
+{
+ unregister_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+#else
+int __init netfilter_lwtunnel_init(void) { return 0; }
+void netfilter_lwtunnel_fini(void) {}
+#endif /* CONFIG_SYSCTL */
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index 832ae64179f0..25403023060b 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -29,6 +29,12 @@ void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
int __init netfilter_log_init(void);
+#ifdef CONFIG_LWTUNNEL
+/* nf_hooks_lwtunnel.c */
+int __init netfilter_lwtunnel_init(void);
+void netfilter_lwtunnel_fini(void);
+#endif
+
/* core.c */
void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
const struct nf_hook_ops *reg);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 6cb9f9474b05..74cef8bf554c 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -31,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
int i;
for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
- if (loggers[pf][i] == NULL)
+ log = nft_log_dereference(loggers[pf][i]);
+ if (!log)
continue;
- log = nft_log_dereference(loggers[pf][i]);
if (!strncasecmp(str_logger, log->name, strlen(log->name)))
return log;
}
@@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger)
}
EXPORT_SYMBOL(nf_log_unregister);
+/**
+ * nf_log_is_registered - Check if any logger is registered for a given
+ * protocol family.
+ *
+ * @pf: Protocol family
+ *
+ * Returns: true if at least one logger is active for @pf, false otherwise.
+ */
+bool nf_log_is_registered(u_int8_t pf)
+{
+ int i;
+
+ if (pf >= NFPROTO_NUMPROTO) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+ if (rcu_access_pointer(loggers[pf][i]))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(nf_log_is_registered);
+
int nf_log_bind_pf(struct net *net, u_int8_t pf,
const struct nf_logger *logger)
{
@@ -151,18 +177,16 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf)
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_logger_request_module(int pf, enum nf_log_type type)
-{
- if (loggers[pf][type] == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-}
-EXPORT_SYMBOL_GPL(nf_logger_request_module);
-
int nf_logger_find_get(int pf, enum nf_log_type type)
{
struct nf_logger *logger;
int ret = -ENOENT;
+ if (pf >= ARRAY_SIZE(loggers))
+ return -EINVAL;
+ if (type >= NF_LOG_TYPE_MAX)
+ return -EINVAL;
+
if (pf == NFPROTO_INET) {
ret = nf_logger_find_get(NFPROTO_IPV4, type);
if (ret < 0)
@@ -177,9 +201,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
return 0;
}
- if (rcu_access_pointer(loggers[pf][type]) == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
if (logger == NULL)
@@ -203,11 +224,12 @@ void nf_logger_put(int pf, enum nf_log_type type)
return;
}
- BUG_ON(loggers[pf][type] == NULL);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
- module_put(logger->me);
+ if (!logger)
+ WARN_ON_ONCE(1);
+ else
+ module_put(logger->me);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_logger_put);
@@ -399,7 +421,7 @@ static const struct seq_operations nflog_seq_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
-static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO];
static struct ctl_table_header *nf_log_sysctl_fhdr;
static struct ctl_table nf_log_sysctl_ftable[] = {
@@ -410,10 +432,9 @@ static struct ctl_table nf_log_sysctl_ftable[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
-static int nf_log_proc_dostring(struct ctl_table *table, int write,
+static int nf_log_proc_dostring(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
@@ -453,9 +474,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
mutex_lock(&nf_log_mutex);
logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
if (!logger)
- strlcpy(buf, "NONE", sizeof(buf));
+ strscpy(buf, "NONE", sizeof(buf));
else
- strlcpy(buf, logger->name, sizeof(buf));
+ strscpy(buf, logger->name, sizeof(buf));
mutex_unlock(&nf_log_mutex);
r = proc_dostring(&tmp, write, buffer, lenp, ppos);
}
@@ -497,9 +518,10 @@ static int netfilter_log_sysctl_init(struct net *net)
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
table[i].extra2 = net;
- net->nf.nf_log_dir_header = register_net_sysctl(net,
- "net/netfilter/nf_log",
- table);
+ net->nf.nf_log_dir_header = register_net_sysctl_sz(net,
+ "net/netfilter/nf_log",
+ table,
+ ARRAY_SIZE(nf_log_sysctl_table));
if (!net->nf.nf_log_dir_header)
goto err_reg;
@@ -517,7 +539,7 @@ err_alloc:
static void netfilter_log_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->nf.nf_log_dir_header->ctl_table_arg;
unregister_net_sysctl_table(net->nf.nf_log_dir_header);
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
deleted file mode 100644
index ae5628ddbe6d..000000000000
--- a/net/netfilter/nf_log_common.c
+++ /dev/null
@@ -1,212 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset)
-{
- struct udphdr _udph;
- const struct udphdr *uh;
-
- if (proto == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- nf_log_buf_add(m, "PROTO=UDP ");
- else /* Max length: 14 "PROTO=UDPLITE " */
- nf_log_buf_add(m, "PROTO=UDPLITE ");
-
- if (fragment)
- goto out;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
- if (uh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
-
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
-
-out:
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_udp_header);
-
-int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset,
- unsigned int logflags)
-{
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* Max length: 10 "PROTO=TCP " */
- nf_log_buf_add(m, "PROTO=TCP ");
-
- if (fragment)
- return 0;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
- if (th == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & NF_LOG_TCPSEQ) {
- nf_log_buf_add(m, "SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- }
-
- /* Max length: 13 "WINDOW=65535 " */
- nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3C " */
- nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
- TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- nf_log_buf_add(m, "CWR ");
- if (th->ece)
- nf_log_buf_add(m, "ECE ");
- if (th->urg)
- nf_log_buf_add(m, "URG ");
- if (th->ack)
- nf_log_buf_add(m, "ACK ");
- if (th->psh)
- nf_log_buf_add(m, "PSH ");
- if (th->rst)
- nf_log_buf_add(m, "RST ");
- if (th->syn)
- nf_log_buf_add(m, "SYN ");
- if (th->fin)
- nf_log_buf_add(m, "FIN ");
- /* Max length: 11 "URGP=65535 " */
- nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
- u_int8_t _opt[60 - sizeof(struct tcphdr)];
- const u_int8_t *op;
- unsigned int i;
- unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
-
- op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
- optsize, _opt);
- if (op == NULL) {
- nf_log_buf_add(m, "OPT (TRUNCATED)");
- return 1;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- nf_log_buf_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- nf_log_buf_add(m, "%02X", op[i]);
-
- nf_log_buf_add(m, ") ");
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
-
-void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
- struct sock *sk)
-{
- if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
- return;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket && sk->sk_socket->file) {
- const struct cred *cred = sk->sk_socket->file->f_cred;
- nf_log_buf_add(m, "UID=%u GID=%u ",
- from_kuid_munged(&init_user_ns, cred->fsuid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
- }
- read_unlock_bh(&sk->sk_callback_lock);
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid);
-
-void
-nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo, const char *prefix)
-{
- const struct net_device *physoutdev __maybe_unused;
- const struct net_device *physindev __maybe_unused;
-
- nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
- '0' + loginfo->u.log.level, prefix,
- in ? in->name : "",
- out ? out->name : "");
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- physindev = nf_bridge_get_physindev(skb);
- if (physindev && in != physindev)
- nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = nf_bridge_get_physoutdev(skb);
- if (physoutdev && out != physoutdev)
- nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
-#endif
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
-
-/* bridge and netdev logging families share this code. */
-void nf_log_l2packet(struct net *net, u_int8_t pf,
- __be16 protocol,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- switch (protocol) {
- case htons(ETH_P_IP):
- nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_IPV6):
- nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_ARP):
- case htons(ETH_P_RARP):
- nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- }
-}
-EXPORT_SYMBOL_GPL(nf_log_l2packet);
-
-static int __init nf_log_common_init(void)
-{
- return 0;
-}
-
-static void __exit nf_log_common_exit(void) {}
-
-module_init(nf_log_common_init);
-module_exit(nf_log_common_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
deleted file mode 100644
index 968dafa684c9..000000000000
--- a/net/netfilter/nf_log_netdev.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_log.h>
-
-static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
- loginfo, prefix);
-}
-
-static struct nf_logger nf_netdev_logger __read_mostly = {
- .name = "nf_log_netdev",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_netdev_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_netdev_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
-}
-
-static void __net_exit nf_log_netdev_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_netdev_logger);
-}
-
-static struct pernet_operations nf_log_netdev_net_ops = {
- .init = nf_log_netdev_net_init,
- .exit = nf_log_netdev_net_exit,
-};
-
-static int __init nf_log_netdev_init(void)
-{
- int ret;
-
- /* Request to load the real packet loggers. */
- nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
-
- ret = register_pernet_subsys(&nf_log_netdev_net_ops);
- if (ret < 0)
- return ret;
-
- nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
- return 0;
-}
-
-static void __exit nf_log_netdev_exit(void)
-{
- unregister_pernet_subsys(&nf_log_netdev_net_ops);
- nf_log_unregister(&nf_netdev_logger);
-}
-
-module_init(nf_log_netdev_init);
-module_exit(nf_log_netdev_exit);
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter netdev packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
new file mode 100644
index 000000000000..86d5fc5d28e3
--- /dev/null
+++ b/net/netfilter/nf_log_syslog.c
@@ -0,0 +1,1085 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static const struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_DEFAULT_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+/* Guard against containers flooding syslog. */
+static bool nf_log_allowed(const struct net *net)
+{
+ return net_eq(net, &init_net) || sysctl_nf_log_all_netns;
+}
+
+static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
+{
+ u16 vid;
+
+ if (!skb_vlan_tag_present(skb))
+ return;
+
+ vid = skb_vlan_tag_get(skb);
+ nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid);
+}
+static void noinline_for_stack
+dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+ const struct arphdr *ah;
+ unsigned int logflags;
+ struct arphdr _arph;
+
+ ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph);
+ if (!ah) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ if (logflags & NF_LOG_MACDECODE) {
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ }
+
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (!ap) {
+ nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+static void
+nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo, const char *prefix,
+ struct net *net)
+{
+ const struct net_device *physoutdev __maybe_unused;
+ const struct net_device *physindev __maybe_unused;
+
+ nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "",
+ out ? out->name : "");
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ physindev = nf_bridge_get_physindev(skb, net);
+ if (physindev && in != physindev)
+ nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+ physoutdev = nf_bridge_get_physoutdev(skb);
+ if (physoutdev && out != physoutdev)
+ nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
+#endif
+}
+
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix, net);
+ dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
+ struct sock *sk)
+{
+ if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
+ return;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ const struct cred *cred = sk->sk_socket->file->f_cred;
+
+ nf_log_buf_add(m, "UID=%u GID=%u ",
+ from_kuid_munged(&init_user_ns, cred->fsuid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static noinline_for_stack int
+nf_log_dump_tcp_header(struct nf_log_buf *m,
+ const struct sk_buff *skb,
+ u8 proto, int fragment,
+ unsigned int offset,
+ unsigned int logflags)
+{
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* Max length: 10 "PROTO=TCP " */
+ nf_log_buf_add(m, "PROTO=TCP ");
+
+ if (fragment)
+ return 0;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (!th) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u ",
+ ntohs(th->source), ntohs(th->dest));
+ /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ if (logflags & NF_LOG_TCPSEQ) {
+ nf_log_buf_add(m, "SEQ=%u ACK=%u ",
+ ntohl(th->seq), ntohl(th->ack_seq));
+ }
+
+ /* Max length: 13 "WINDOW=65535 " */
+ nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
+ /* Max length: 9 "RES=0x3C " */
+ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+ TCP_RESERVED_BITS) >> 22));
+ /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->ae)
+ nf_log_buf_add(m, "AE ");
+ if (th->cwr)
+ nf_log_buf_add(m, "CWR ");
+ if (th->ece)
+ nf_log_buf_add(m, "ECE ");
+ if (th->urg)
+ nf_log_buf_add(m, "URG ");
+ if (th->ack)
+ nf_log_buf_add(m, "ACK ");
+ if (th->psh)
+ nf_log_buf_add(m, "PSH ");
+ if (th->rst)
+ nf_log_buf_add(m, "RST ");
+ if (th->syn)
+ nf_log_buf_add(m, "SYN ");
+ if (th->fin)
+ nf_log_buf_add(m, "FIN ");
+ /* Max length: 11 "URGP=65535 " */
+ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+ if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) {
+ unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr);
+ u8 _opt[60 - sizeof(struct tcphdr)];
+ unsigned int i;
+ const u8 *op;
+
+ op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+ optsize, _opt);
+ if (!op) {
+ nf_log_buf_add(m, "OPT (TRUNCATED)");
+ return 1;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+
+ nf_log_buf_add(m, ") ");
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+nf_log_dump_udp_header(struct nf_log_buf *m,
+ const struct sk_buff *skb,
+ u8 proto, int fragment,
+ unsigned int offset)
+{
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ if (proto == IPPROTO_UDP)
+ /* Max length: 10 "PROTO=UDP " */
+ nf_log_buf_add(m, "PROTO=UDP ");
+ else /* Max length: 14 "PROTO=UDPLITE " */
+ nf_log_buf_add(m, "PROTO=UDPLITE ");
+
+ if (fragment)
+ goto out;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (!uh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
+ ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
+
+out:
+ return 0;
+}
+
+/* One level of recursion won't kill us */
+static noinline_for_stack void
+dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ const struct iphdr *ih;
+ unsigned int logflags;
+ struct iphdr _iph;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (!ih) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options.
+ * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 "
+ */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & NF_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ const unsigned char *op;
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff + sizeof(_iph),
+ optsize, _opt);
+ if (!op) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff + ih->ihl * 4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff + ih->ihl * 4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ static const size_t required_len[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT] = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+ const struct icmphdr *ich;
+ struct icmphdr _icmph;
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (!ich) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ fallthrough;
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(net, m, info, skb,
+ iphoff + ih->ihl * 4 + sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ const struct ip_auth_hdr *ah;
+ struct ip_auth_hdr _ahdr;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_ahdr), &_ahdr);
+ if (!ah) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ const struct ip_esp_hdr *eh;
+ struct ip_esp_hdr _esph;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_esph), &_esph);
+ if (!eh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & NF_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 255 = 806 */
+}
+
+static noinline_for_stack void
+dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int ip6hoff,
+ int recurse)
+{
+ const struct ipv6hdr *ih;
+ unsigned int hdrlen = 0;
+ unsigned int logflags;
+ struct ipv6hdr _ip6h;
+ unsigned int ptr;
+ u8 currenthdr;
+ int fragment;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ ih->hop_limit,
+ (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+ fragment = 0;
+ ptr = ip6hoff + sizeof(struct ipv6hdr);
+ currenthdr = ih->nexthdr;
+ while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) {
+ struct ipv6_opt_hdr _hdr;
+ const struct ipv6_opt_hdr *hp;
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ if (!hp) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 48 "OPT (...) " */
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, "OPT ( ");
+
+ switch (currenthdr) {
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr _fhdr;
+ const struct frag_hdr *fh;
+
+ nf_log_buf_add(m, "FRAG:");
+ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+ &_fhdr);
+ if (!fh) {
+ nf_log_buf_add(m, "TRUNCATED ");
+ return;
+ }
+
+ /* Max length: 6 "65535 " */
+ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+ /* Max length: 11 "INCOMPLETE " */
+ if (fh->frag_off & htons(0x0001))
+ nf_log_buf_add(m, "INCOMPLETE ");
+
+ nf_log_buf_add(m, "ID:%08x ",
+ ntohl(fh->identification));
+
+ if (ntohs(fh->frag_off) & 0xFFF8)
+ fragment = 1;
+
+ hdrlen = 8;
+ break;
+ }
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_HOPOPTS:
+ if (fragment) {
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ")");
+ return;
+ }
+ hdrlen = ipv6_optlen(hp);
+ break;
+ /* Max Length */
+ case IPPROTO_AH:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ /* Max length: 3 "AH " */
+ nf_log_buf_add(m, "AH ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+ &_ahdr);
+ if (!ah) {
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ }
+
+ hdrlen = ipv6_authlen(hp);
+ break;
+ case IPPROTO_ESP:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 4 "ESP " */
+ nf_log_buf_add(m, "ESP ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+ &_esph);
+ if (!eh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 16 "SPI=0xF1234567 )" */
+ nf_log_buf_add(m, "SPI=0x%x )",
+ ntohl(eh->spi));
+ }
+ return;
+ default:
+ /* Max length: 20 "Unknown Ext Hdr 255" */
+ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
+ return;
+ }
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ") ");
+
+ currenthdr = hp->nexthdr;
+ ptr += hdrlen;
+ }
+
+ switch (currenthdr) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
+ ptr, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
+ return;
+ break;
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _icmp6h;
+ const struct icmp6hdr *ic;
+
+ /* Max length: 13 "PROTO=ICMPv6 " */
+ nf_log_buf_add(m, "PROTO=ICMPv6 ");
+
+ if (fragment)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+ if (!ic) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ",
+ ic->icmp6_type, ic->icmp6_code);
+
+ switch (ic->icmp6_type) {
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ic->icmp6_identifier),
+ ntohs(ic->icmp6_sequence));
+ break;
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ break;
+
+ case ICMPV6_PARAMPROB:
+ /* Max length: 17 "POINTER=ffffffff " */
+ nf_log_buf_add(m, "POINTER=%08x ",
+ ntohl(ic->icmp6_pointer));
+ fallthrough;
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_PKT_TOOBIG:
+ case ICMPV6_TIME_EXCEED:
+ /* Max length: 3+maxlen */
+ if (recurse) {
+ nf_log_buf_add(m, "[");
+ dump_ipv6_packet(net, m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohl(ic->icmp6_mtu));
+ }
+ }
+ break;
+ }
+ /* Max length: 10 "PROTO=255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", currenthdr);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & NF_LOG_UID) && recurse)
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (recurse && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & NF_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ if (dev->type == ARPHRD_SIT) {
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p) {
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++)
+ nf_log_buf_add(m, ":%02x", *p++);
+ }
+
+ if (dev->type == ARPHRD_SIT) {
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+
+ nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr,
+ &iph->daddr);
+ }
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix, net);
+
+ if (in)
+ dump_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb));
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
+ loginfo, prefix, net);
+
+ if (in)
+ dump_mac_header(m, loginfo, skb);
+
+ dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip6_logger __read_mostly = {
+ .name = "nf_log_ipv6",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip6_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix, net);
+
+ dump_mac_header(m, loginfo, skb);
+
+ nf_log_buf_close(m);
+}
+
+static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ default:
+ nf_log_unknown_packet(net, pf, hooknum, skb,
+ in, out, loginfo, prefix);
+ break;
+ }
+}
+
+static struct nf_logger nf_netdev_logger __read_mostly = {
+ .name = "nf_log_netdev",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static struct nf_logger nf_bridge_logger __read_mostly = {
+ .name = "nf_log_bridge",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_syslog_net_init(struct net *net)
+{
+ int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+
+ if (ret)
+ return ret;
+
+ ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ if (ret)
+ goto err1;
+
+ ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
+ if (ret)
+ goto err2;
+
+ ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
+ if (ret)
+ goto err3;
+
+ ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
+ if (ret)
+ goto err4;
+ return 0;
+err4:
+ nf_log_unset(net, &nf_netdev_logger);
+err3:
+ nf_log_unset(net, &nf_ip6_logger);
+err2:
+ nf_log_unset(net, &nf_arp_logger);
+err1:
+ nf_log_unset(net, &nf_ip_logger);
+ return ret;
+}
+
+static void __net_exit nf_log_syslog_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+ nf_log_unset(net, &nf_arp_logger);
+ nf_log_unset(net, &nf_ip6_logger);
+ nf_log_unset(net, &nf_netdev_logger);
+ nf_log_unset(net, &nf_bridge_logger);
+}
+
+static struct pernet_operations nf_log_syslog_net_ops = {
+ .init = nf_log_syslog_net_init,
+ .exit = nf_log_syslog_net_exit,
+};
+
+static int __init nf_log_syslog_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_syslog_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ if (ret < 0)
+ goto err1;
+
+ ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ if (ret < 0)
+ goto err2;
+
+ ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+ if (ret < 0)
+ goto err3;
+
+ ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
+ if (ret < 0)
+ goto err4;
+
+ ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger);
+ if (ret < 0)
+ goto err5;
+
+ return 0;
+err5:
+ nf_log_unregister(&nf_netdev_logger);
+err4:
+ nf_log_unregister(&nf_ip6_logger);
+err3:
+ nf_log_unregister(&nf_arp_logger);
+err2:
+ nf_log_unregister(&nf_ip_logger);
+err1:
+ pr_err("failed to register logger\n");
+ unregister_pernet_subsys(&nf_log_syslog_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_syslog_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_syslog_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+ nf_log_unregister(&nf_arp_logger);
+ nf_log_unregister(&nf_ip6_logger);
+ nf_log_unregister(&nf_netdev_logger);
+ nf_log_unregister(&nf_bridge_logger);
+}
+
+module_init(nf_log_syslog_init);
+module_exit(nf_log_syslog_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter syslog packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf_log_arp");
+MODULE_ALIAS("nf_log_bridge");
+MODULE_ALIAS("nf_log_ipv4");
+MODULE_ALIAS("nf_log_ipv6");
+MODULE_ALIAS("nf_log_netdev");
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0);
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
+MODULE_ALIAS_NF_LOGGER(3, 0);
+MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 3bc7e0854efe..98deef6cde69 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -44,19 +44,7 @@ static unsigned int help(struct sk_buff *skb,
exp->expectfn = nf_nat_follow_master;
/* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int res;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- res = nf_ct_expect_related(exp, 0);
- if (res == 0)
- break;
- else if (res != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
new file mode 100644
index 000000000000..481be15609b1
--- /dev/null
+++ b/net/netfilter/nf_nat_bpf.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable NAT Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+
+__bpf_kfunc_start_defs();
+
+/* bpf_ct_set_nat_info - Set source or destination nat address
+ *
+ * Set source or destination nat address of the newly allocated
+ * nf_conn before insertion. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr - Nat source/destination address
+ * @port - Nat source/destination port. Non-positive values are
+ * interpreted as select a random port.
+ * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ */
+__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+ union nf_inet_addr *addr, int port,
+ enum nf_nat_manip_type manip)
+{
+ struct nf_conn *ct = (struct nf_conn *)nfct;
+ u16 proto = nf_ct_l3num(ct);
+ struct nf_nat_range2 range;
+
+ if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
+ return -EINVAL;
+
+ memset(&range, 0, sizeof(struct nf_nat_range2));
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = *addr;
+ range.max_addr = range.min_addr;
+ if (port > 0) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto.all = cpu_to_be16(port);
+ range.max_proto.all = range.min_proto.all;
+ }
+
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nf_nat_kfunc_set)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(nf_nat_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_nat_kfunc_set,
+};
+
+int register_nf_nat_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_bpf_nat_kfunc_set);
+ if (ret)
+ return ret;
+
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &nf_bpf_nat_kfunc_set);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index bfc555fcbc72..78a61dac4ade 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -13,10 +13,10 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/rtnetlink.h>
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
@@ -27,6 +27,9 @@
#include "nf_internals.h"
+#define NF_NAT_MAX_ATTEMPTS 128
+#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)
+
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
@@ -34,7 +37,7 @@ static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
-static unsigned int nf_nat_hash_rnd __read_mostly;
+static siphash_aligned_key_t nf_nat_hash_rnd;
struct nf_nat_lookup_hook_priv {
struct nf_hook_entries __rcu *entries;
@@ -66,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl4->fl4_dport = t->dst.u.all;
}
@@ -78,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl4->fl4_sport = t->src.u.all;
}
@@ -99,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl6->fl6_dport = t->dst.u.all;
}
@@ -111,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
if (t->dst.protonum == IPPROTO_TCP ||
t->dst.protonum == IPPROTO_UDP ||
t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
t->dst.protonum == IPPROTO_SCTP)
fl6->fl6_sport = t->src.u.all;
}
@@ -146,61 +145,69 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
return;
}
}
-
-int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
-{
- struct flowi fl;
- unsigned int hh_len;
- struct dst_entry *dst;
- struct sock *sk = skb->sk;
- int err;
-
- err = xfrm_decode_session(skb, &fl, family);
- if (err < 0)
- return err;
-
- dst = skb_dst(skb);
- if (dst->xfrm)
- dst = ((struct xfrm_dst *)dst)->route;
- if (!dst_hold_safe(dst))
- return -EHOSTUNREACH;
-
- if (sk && !net_eq(net, sock_net(sk)))
- sk = NULL;
-
- dst = xfrm_lookup(net, dst, &fl, sk, 0);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
-
- /* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
- if (skb_headroom(skb) < hh_len &&
- pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -ENOMEM;
- return 0;
-}
-EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net,
+ const struct nf_conntrack_zone *zone,
+ const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ struct {
+ struct nf_conntrack_man src;
+ u32 net_mix;
+ u32 protonum;
+ u32 zone;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ memset(&combined, 0, sizeof(combined));
+
/* Original src, to ensure we map it consistently if poss. */
- hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
+ combined.src = tuple->src;
+ combined.net_mix = net_hash_mix(net);
+ combined.protonum = tuple->dst.protonum;
+
+ /* Zone ID can be used provided its valid for both directions */
+ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
+ combined.zone = zone->id;
+
+ hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
return reciprocal_scale(hash, nf_nat_htable_size);
}
-/* Is this tuple already taken? (not by us) */
+/**
+ * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_conntrack: our (unconfirmed) conntrack entry
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple in either direction.
+ *
+ * Example:
+ * INITIATOR -> NAT/PAT -> RESPONDER
+ *
+ * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
+ * Then, later, NAT/PAT itself also connects to RESPONDER.
+ *
+ * This will not work if the SNAT done earlier has same IP:PORT source pair.
+ *
+ * Conntrack table has:
+ * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * and new locally originating connection wants:
+ * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * ... which would mean incoming packets cannot be distinguished between
+ * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
+ *
+ * @return: true if the proposed NAT mapping collides with an existing entry.
+ */
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
@@ -217,6 +224,182 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
+static bool nf_nat_allow_clash(const struct nf_conn *ct)
+{
+ return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
+}
+
+/**
+ * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_ct: our (unconfirmed) conntrack entry
+ *
+ * Same as nf_nat_used_tuple, but also check for rare clash in reverse
+ * direction. Should be called only when @tuple has not been altered, i.e.
+ * @ignored_conntrack will not be subject to NAT.
+ *
+ * @return: true if the proposed NAT mapping collides with existing entry.
+ */
+static noinline bool
+nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_ct)
+{
+ static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST;
+ const struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ if (!nf_nat_used_tuple(tuple, ignored_ct))
+ return false;
+
+ if (!nf_nat_allow_clash(ignored_ct))
+ return true;
+
+ /* Initial choice clashes with existing conntrack.
+ * Check for (rare) reverse collision.
+ *
+ * This can happen when new packets are received in both directions
+ * at the exact same time on different CPUs.
+ *
+ * Without SMP, first packet creates new conntrack entry and second
+ * packet is resolved as established reply packet.
+ *
+ * With parallel processing, both packets could be picked up as
+ * new and both get their own ct entry allocated.
+ *
+ * If ignored_conntrack and colliding ct are not subject to NAT then
+ * pretend the tuple is available and let later clash resolution
+ * handle this at insertion time.
+ *
+ * Without it, the 'reply' packet has its source port rewritten
+ * by nat engine.
+ */
+ if (READ_ONCE(ignored_ct->status) & uses_nat)
+ return true;
+
+ net = nf_ct_net(ignored_ct);
+ zone = nf_ct_zone(ignored_ct);
+
+ thash = nf_conntrack_find_get(net, zone, tuple);
+ if (unlikely(!thash)) {
+ struct nf_conntrack_tuple reply;
+
+ nf_ct_invert_tuple(&reply, tuple);
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash) /* clashing entry went away */
+ return false;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ /* NB: IP_CT_DIR_ORIGINAL should be impossible because
+ * nf_nat_used_tuple() handles origin collisions.
+ *
+ * Handle remote chance other CPU confirmed its ct right after.
+ */
+ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
+ goto out;
+
+ /* clashing connection subject to NAT? Retry with new tuple. */
+ if (READ_ONCE(ct->status) & uses_nat)
+ goto out;
+
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+ taken = false;
+ goto out;
+ }
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
+static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
+{
+ static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
+ IPS_DYING;
+ static const unsigned long flags_needed = IPS_SRC_NAT;
+ enum tcp_conntrack old_state;
+
+ old_state = READ_ONCE(ct->proto.tcp.state);
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ return false;
+
+ if (flags & flags_refuse)
+ return false;
+
+ return (flags & flags_needed) == flags_needed;
+}
+
+/* reverse direction will send packets to new source, so
+ * make sure such packets are invalid.
+ */
+static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
+{
+ return (__s32)(new->proto.tcp.seen[0].td_end -
+ old->proto.tcp.seen[0].td_end) > 0;
+}
+
+static int
+nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack,
+ unsigned int attempts_left)
+{
+ static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
+ struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple reply;
+ unsigned long flags;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ nf_ct_invert_tuple(&reply, tuple);
+
+ if (attempts_left > NF_NAT_HARDER_THRESH ||
+ tuple->dst.protonum != IPPROTO_TCP ||
+ ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
+ return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+
+ /* :ast few attempts to find a free tcp port. Destructive
+ * action: evict colliding if its in timewait state and the
+ * tcp sequence number has advanced past the one used by the
+ * old entry.
+ */
+ net = nf_ct_net(ignored_conntrack);
+ zone = nf_ct_zone(ignored_conntrack);
+
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash)
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ if (WARN_ON_ONCE(ct == ignored_conntrack))
+ goto out;
+
+ flags = READ_ONCE(ct->status);
+ if (!nf_nat_may_kill(ct, flags))
+ goto out;
+
+ if (!nf_seq_has_advanced(ct, ignored_conntrack))
+ goto out;
+
+ /* Even if we can evict do not reuse if entry is offloaded. */
+ if (nf_ct_kill(ct))
+ taken = flags & flags_offload;
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
{
@@ -245,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- case IPPROTO_DCCP:
case IPPROTO_SCTP:
if (maniptype == NF_NAT_MANIP_SRC)
port = tuple->src.u.all;
@@ -262,7 +444,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range.
*/
-static int in_range(const struct nf_conntrack_tuple *tuple,
+static int nf_in_range(const struct nf_conntrack_tuple *tuple,
const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
@@ -299,7 +481,7 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range2 *range)
{
- unsigned int h = hash_by_src(net, tuple);
+ unsigned int h = hash_by_src(net, zone, tuple);
const struct nf_conn *ct;
hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
@@ -311,7 +493,7 @@ find_appropriate_src(struct net *net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
- if (in_range(result, range))
+ if (nf_in_range(result, range))
return 1;
}
}
@@ -405,10 +587,9 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
u16 off;
- static const unsigned int max_attempts = 128;
switch (tuple->dst.protonum) {
- case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
/* id is same for either direction... */
keyptr = &tuple->src.u.icmp.id;
@@ -442,11 +623,10 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
}
goto find_free_id;
#endif
- case IPPROTO_UDP: /* fallthrough */
- case IPPROTO_UDPLITE: /* fallthrough */
- case IPPROTO_TCP: /* fallthrough */
- case IPPROTO_SCTP: /* fallthrough */
- case IPPROTO_DCCP: /* fallthrough */
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.all;
else
@@ -487,12 +667,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
find_free_id:
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
+ else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
+ maniptype != NF_NAT_MANIP_DST)
+ off = get_random_u16();
else
- off = prandom_u32();
+ off = 0;
attempts = range_size;
- if (attempts > max_attempts)
- attempts = max_attempts;
+ if (attempts > NF_NAT_MAX_ATTEMPTS)
+ attempts = NF_NAT_MAX_ATTEMPTS;
/* We are in softirq; doing a search of the entire range risks
* soft lockup when all tuples are already used.
@@ -503,14 +686,14 @@ find_free_id:
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
- if (!nf_nat_used_tuple(tuple, ct))
+ if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
return;
}
if (attempts >= range_size || attempts < 16)
return;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
}
@@ -543,8 +726,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
- if (in_range(orig_tuple, range)) {
- if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ if (nf_in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
@@ -569,8 +752,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
@@ -646,7 +829,7 @@ nf_nat_setup_info(struct nf_conn *ct,
unsigned int srchash;
spinlock_t *lock;
- srchash = hash_by_src(net,
+ srchash = hash_by_src(net, nf_ct_zone(ct),
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
@@ -719,6 +902,16 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
+}
+
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -735,7 +928,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb,
* packet filter it out, or implement conntrack/NAT for that
* protocol. 8) --RR
*/
- if (!ct)
+ if (!ct || in_vrf_postrouting(state))
return NF_ACCEPT;
nat = nfct_nat(ct);
@@ -811,11 +1004,11 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
- h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
hlist_del_rcu(&ct->nat_bysource);
spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
@@ -833,7 +1026,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
- __nf_nat_cleanup_conntrack(ct);
+ nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -841,20 +1034,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- if (ct->status & IPS_SRC_NAT_DONE)
- __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .id = NF_CT_EXT_NAT,
-};
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -911,10 +1090,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
range->flags |= NF_NAT_RANGE_MAP_IPS;
}
- if (tb[CTA_NAT_V4_MAXIP])
- range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
- else
- range->max_addr.ip = range->min_addr.ip;
+ range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP],
+ range->min_addr.ip);
return 0;
}
@@ -1041,7 +1218,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
if (!nat_proto_net->nat_hook_ops) {
WARN_ON(nat_proto_net->users != 0);
- nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+ nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL);
if (!nat_ops) {
mutex_unlock(&nf_nat_proto_mutex);
return -ENOMEM;
@@ -1140,12 +1317,12 @@ static struct pernet_operations nat_net_ops = {
.size = sizeof(struct nat_net),
};
-static struct nf_nat_hook nat_hook = {
+static const struct nf_nat_hook nat_hook = {
.parse_nat_setup = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
- .manip_pkt = nf_nat_manip_pkt,
+ .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
static int __init nf_nat_init(void)
@@ -1161,19 +1338,12 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- kvfree(nf_nat_bysource);
- pr_err("Unable to register extension\n");
- return ret;
- }
-
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
+ kvfree(nf_nat_bysource);
return ret;
}
@@ -1182,7 +1352,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return 0;
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
@@ -1191,7 +1370,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
- nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
@@ -1201,6 +1379,7 @@ static void __exit nf_nat_cleanup(void)
}
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Network address translation core");
module_init(nf_nat_init);
module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index aace6768a64e..c92a436d9c48 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -86,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
- nf_ct_helper_log(skb, ct, "all ports in use");
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index a263505455fc..bf591e6af005 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -198,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct,
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL(nf_nat_follow_master);
+
+u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port)
+{
+ static const unsigned int max_attempts = 128;
+ int range, attempts_left;
+ u16 min = port;
+
+ range = USHRT_MAX - port;
+ attempts_left = range;
+
+ if (attempts_left > max_attempts)
+ attempts_left = max_attempts;
+
+ /* Try to get same port: if not, try to change it. */
+ for (;;) {
+ int res;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ res = nf_ct_expect_related(exp, 0);
+ if (res == 0)
+ return port;
+
+ if (res != -EBUSY || (--attempts_left < 0))
+ break;
+
+ port = min + get_random_u32_below(range);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_exp_find_port);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index c691ab8d234c..19c4fcc60c50 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -48,20 +48,8 @@ static unsigned int help(struct sk_buff *skb,
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp, 0);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp,
+ ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use");
return NF_DROP;
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index 8e8a65d46345..1a506b0c6511 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -9,8 +9,20 @@
#include <net/netfilter/nf_nat_masquerade.h>
+struct masq_dev_work {
+ struct work_struct work;
+ struct net *net;
+ netns_tracker ns_tracker;
+ union nf_inet_addr addr;
+ int ifindex;
+ int (*iter)(struct nf_conn *i, void *data);
+};
+
+#define MAX_MASQ_WORKER_COUNT 16
+
static DEFINE_MUTEX(masq_mutex);
static unsigned int masq_refcnt __read_mostly;
+static atomic_t masq_worker_count __read_mostly;
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
@@ -63,13 +75,75 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
-static int device_cmp(struct nf_conn *i, void *ifindex)
+static void iterate_cleanup_work(struct work_struct *work)
+{
+ struct nf_ct_iter_data iter_data = {};
+ struct masq_dev_work *w;
+
+ w = container_of(work, struct masq_dev_work, work);
+
+ iter_data.net = w->net;
+ iter_data.data = (void *)w;
+ nf_ct_iterate_cleanup_net(w->iter, &iter_data);
+
+ put_net_track(w->net, &w->ns_tracker);
+ kfree(w);
+ atomic_dec(&masq_worker_count);
+ module_put(THIS_MODULE);
+}
+
+/* Iterate conntrack table in the background and remove conntrack entries
+ * that use the device/address being removed.
+ *
+ * In case too many work items have been queued already or memory allocation
+ * fails iteration is skipped, conntrack entries will time out eventually.
+ */
+static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
+ int ifindex,
+ int (*iter)(struct nf_conn *i, void *data),
+ gfp_t gfp_flags)
+{
+ struct masq_dev_work *w;
+
+ if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
+ return;
+
+ net = maybe_get_net(net);
+ if (!net)
+ return;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err_module;
+
+ w = kzalloc(sizeof(*w), gfp_flags);
+ if (w) {
+ /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
+ atomic_inc(&masq_worker_count);
+
+ INIT_WORK(&w->work, iterate_cleanup_work);
+ w->ifindex = ifindex;
+ w->net = net;
+ netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
+ w->iter = iter;
+ if (addr)
+ w->addr = *addr;
+ schedule_work(&w->work);
+ return;
+ }
+
+ module_put(THIS_MODULE);
+ err_module:
+ put_net(net);
+}
+
+static int device_cmp(struct nf_conn *i, void *arg)
{
const struct nf_conn_nat *nat = nfct_nat(i);
+ const struct masq_dev_work *w = arg;
if (!nat)
return 0;
- return nat->masq_index == (int)(long)ifindex;
+ return nat->masq_index == w->ifindex;
}
static int masq_device_event(struct notifier_block *this,
@@ -85,8 +159,8 @@ static int masq_device_event(struct notifier_block *this,
* and forget them.
*/
- nf_ct_iterate_cleanup_net(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
+ nf_nat_masq_schedule(net, NULL, dev->ifindex,
+ device_cmp, GFP_KERNEL);
}
return NOTIFY_DONE;
@@ -94,35 +168,45 @@ static int masq_device_event(struct notifier_block *this,
static int inet_cmp(struct nf_conn *ct, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
- struct net_device *dev = ifa->ifa_dev->dev;
struct nf_conntrack_tuple *tuple;
+ struct masq_dev_work *w = ptr;
- if (!device_cmp(ct, (void *)(long)dev->ifindex))
+ if (!device_cmp(ct, ptr))
return 0;
tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- return ifa->ifa_address == tuple->dst.u3.ip;
+ return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
}
static int masq_inet_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
- struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct net *net = dev_net(idev->dev);
+ const struct in_ifaddr *ifa = ptr;
+ const struct in_device *idev;
+ const struct net_device *dev;
+ union nf_inet_addr addr;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
/* The masq_dev_notifier will catch the case of the device going
* down. So if the inetdev is dead and being destroyed we have
* no work to do. Otherwise this is an individual address removal
* and we have to perform the flush.
*/
+ idev = ifa->ifa_dev;
if (idev->dead)
return NOTIFY_DONE;
- if (event == NETDEV_DOWN)
- nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0);
+ memset(&addr, 0, sizeof(addr));
+
+ addr.ip = ifa->ifa_address;
+
+ dev = idev->dev;
+ nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
+ inet_cmp, GFP_KERNEL);
return NOTIFY_DONE;
}
@@ -136,8 +220,6 @@ static struct notifier_block masq_inet_notifier = {
};
#if IS_ENABLED(CONFIG_IPV6)
-static atomic_t v6_worker_count __read_mostly;
-
static int
nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
const struct in6_addr *daddr, unsigned int srcprefs,
@@ -187,40 +269,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
-struct masq_dev_work {
- struct work_struct work;
- struct net *net;
- struct in6_addr addr;
- int ifindex;
-};
-
-static int inet6_cmp(struct nf_conn *ct, void *work)
-{
- struct masq_dev_work *w = (struct masq_dev_work *)work;
- struct nf_conntrack_tuple *tuple;
-
- if (!device_cmp(ct, (void *)(long)w->ifindex))
- return 0;
-
- tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
- return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6);
-}
-
-static void iterate_cleanup_work(struct work_struct *work)
-{
- struct masq_dev_work *w;
-
- w = container_of(work, struct masq_dev_work, work);
-
- nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0);
-
- put_net(w->net);
- kfree(w);
- atomic_dec(&v6_worker_count);
- module_put(THIS_MODULE);
-}
-
/* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
*
* Defer it to the system workqueue.
@@ -233,36 +281,19 @@ static int masq_inet6_event(struct notifier_block *this,
{
struct inet6_ifaddr *ifa = ptr;
const struct net_device *dev;
- struct masq_dev_work *w;
- struct net *net;
+ union nf_inet_addr addr;
- if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16)
+ if (event != NETDEV_DOWN)
return NOTIFY_DONE;
dev = ifa->idev->dev;
- net = maybe_get_net(dev_net(dev));
- if (!net)
- return NOTIFY_DONE;
-
- if (!try_module_get(THIS_MODULE))
- goto err_module;
- w = kmalloc(sizeof(*w), GFP_ATOMIC);
- if (w) {
- atomic_inc(&v6_worker_count);
+ memset(&addr, 0, sizeof(addr));
- INIT_WORK(&w->work, iterate_cleanup_work);
- w->ifindex = dev->ifindex;
- w->net = net;
- w->addr = ifa->addr;
- schedule_work(&w->work);
+ addr.in6 = ifa->addr;
- return NOTIFY_DONE;
- }
-
- module_put(THIS_MODULE);
- err_module:
- put_net(net);
+ nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
+ GFP_ATOMIC);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
new file mode 100644
index 000000000000..0f9a559f6207
--- /dev/null
+++ b/net/netfilter/nf_nat_ovs.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support nat functions for openvswitch and used by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_nat.h>
+
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ __be16 proto = skb_protocol(skb, true);
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (proto == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ fallthrough;
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+ if (err == NF_ACCEPT)
+ *action |= BIT(maniptype);
+
+ return err;
+}
+
+int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range, bool commit)
+{
+ enum nf_nat_manip_type maniptype;
+ int err, ct_action = *action;
+
+ *action = 0;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & BIT(NF_NAT_MANIP_DST)) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype);
+ if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
+ if (ct->status & IPS_SRC_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range,
+ maniptype);
+ } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL,
+ NF_NAT_MANIP_SRC);
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat);
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 59151dc07fdc..b14a434b9561 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -180,46 +180,6 @@ tcp_manip_pkt(struct sk_buff *skb,
}
static bool
-dccp_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct dccp_hdr *hdr;
- __be16 *portptr, oldport, newport;
- int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
- if (skb->len >= hdroff + sizeof(struct dccp_hdr))
- hdrsize = sizeof(struct dccp_hdr);
-
- if (skb_ensure_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- newport = tuple->src.u.dccp.port;
- portptr = &hdr->dccph_sport;
- } else {
- newport = tuple->dst.u.dccp.port;
- portptr = &hdr->dccph_dport;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype);
- inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- false);
-#endif
- return true;
-}
-
-static bool
icmp_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
@@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb,
case IPPROTO_ICMPV6:
return icmpv6_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
- case IPPROTO_DCCP:
- return dccp_manip_pkt(skb, iphdroff, hdroff,
- tuple, maniptype);
case IPPROTO_GRE:
return gre_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
@@ -646,8 +603,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
}
static unsigned int
-nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
+nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
unsigned int ret;
__be32 daddr = ip_hdr(skb)->daddr;
@@ -659,6 +616,98 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
return ret;
}
+#ifdef CONFIG_XFRM
+static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
+{
+ struct sock *sk = skb->sk;
+ struct dst_entry *dst;
+ unsigned int hh_len;
+ struct flowi fl;
+ int err;
+
+ err = xfrm_decode_session(net, skb, &fl, family);
+ if (err < 0)
+ return err;
+
+ dst = skb_dst(skb);
+ if (dst->xfrm)
+ dst = ((struct xfrm_dst *)dst)->route;
+ if (!dst_hold_safe(dst))
+ return -EHOSTUNREACH;
+
+ if (sk && !net_eq(net, sock_net(sk)))
+ sk = NULL;
+
+ dst = xfrm_lookup(net, dst, &fl, sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+ return -ENOMEM;
+ return 0;
+}
+#endif
+
+static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport)
+{
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ const struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ return false;
+ }
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir != IP_CT_DIR_ORIGINAL)
+ return false;
+
+ return ct->tuplehash[!dir].tuple.dst.u.all != sport;
+}
+
+static unsigned int
+nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ __be32 saddr = ip_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv4_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* skb has a socket assigned via tcp edemux. We need to check
+ * if nf_nat_ipv4_fn() has mangled the packet in a way that
+ * edemux would not have found this socket.
+ *
+ * This includes both changes to the source address and changes
+ * to the source port, which are both handled by the
+ * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux
+ * might have found a socket for the old (pre-snat) address.
+ */
+ if (saddr != ip_hdr(skb)->saddr ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb); /* TCP edemux obtained wrong socket */
+
+ return ret;
+}
+
static unsigned int
nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -715,7 +764,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -736,7 +785,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
/* Before packet filtering, change destination */
{
- .hook = nf_nat_ipv4_in,
+ .hook = nf_nat_ipv4_pre_routing,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
@@ -757,7 +806,7 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv4_fn,
+ .hook = nf_nat_ipv4_local_in,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
@@ -883,14 +932,36 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
}
static unsigned int
+nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr = ipv6_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* see nf_nat_ipv4_local_in */
+ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb);
+
+ return ret;
+}
+
+static unsigned int
nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- unsigned int ret;
+ unsigned int ret, verdict;
struct in6_addr daddr = ipv6_hdr(skb)->daddr;
ret = nf_nat_ipv6_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
+ verdict = ret & NF_VERDICT_MASK;
+ if (verdict != NF_DROP && verdict != NF_STOLEN &&
ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
skb_dst_drop(skb);
@@ -953,7 +1024,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
&ct->tuplehash[!dir].tuple.src.u3)) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -996,7 +1067,7 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
},
/* After packet filtering, change source */
{
- .hook = nf_nat_ipv6_fn,
+ .hook = nf_nat_ipv6_local_in,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index f91579c821e9..5b37487d9d11 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -10,6 +10,7 @@
#include <linux/if.h>
#include <linux/inetdevice.h>
+#include <linux/in.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
@@ -24,81 +25,104 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_redirect.h>
+static unsigned int
+nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
+ const union nf_inet_addr *newdst)
+{
+ struct nf_nat_range2 newrange;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ memset(&newrange, 0, sizeof(newrange));
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = *newdst;
+ newrange.max_addr = *newdst;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
- const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be32 newdst;
- struct nf_nat_range2 newrange;
+ union nf_inet_addr newdst = {};
WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
hooknum != NF_INET_LOCAL_OUT);
- ct = nf_ct_get(skb, &ctinfo);
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
-
/* Local packets: make them go to loopback */
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = htonl(0x7F000001);
+ newdst.ip = htonl(INADDR_LOOPBACK);
} else {
const struct in_device *indev;
- newdst = 0;
-
indev = __in_dev_get_rcu(skb->dev);
if (indev) {
const struct in_ifaddr *ifa;
ifa = rcu_dereference(indev->ifa_list);
if (ifa)
- newdst = ifa->ifa_local;
+ newdst.ip = ifa->ifa_local;
}
- if (!newdst)
+ if (!newdst.ip)
return NF_DROP;
}
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newdst;
- newrange.max_addr.ip = newdst;
- newrange.min_proto = mr->range[0].min;
- newrange.max_proto = mr->range[0].max;
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
+static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope)
+{
+ unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr);
+
+ if (ifa_addr_type & IPV6_ADDR_MAPPED)
+ return false;
+
+ if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC)))
+ return false;
+
+ if (scope) {
+ unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK;
+
+ if (!(scope & ifa_scope))
+ return false;
+ }
+
+ return true;
+}
+
unsigned int
nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_nat_range2 newrange;
- struct in6_addr newdst;
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
+ union nf_inet_addr newdst = {};
- ct = nf_ct_get(skb, &ctinfo);
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = loopback_addr;
+ newdst.in6 = loopback_addr;
} else {
+ unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
bool addr = false;
idev = __in6_dev_get(skb->dev);
if (idev != NULL) {
+ const struct inet6_ifaddr *ifa;
+
read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
- newdst = ifa->addr;
+ if (!nf_nat_redirect_ipv6_usable(ifa, scope))
+ continue;
+
+ newdst.in6 = ifa->addr;
addr = true;
break;
}
@@ -109,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
return NF_DROP;
}
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.in6 = newdst;
- newrange.max_addr.in6 = newdst;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index f0a735e86851..cf4aeb299bde 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -410,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
exp->dir = !dir;
exp->expectfn = nf_nat_sip_expected;
- for (; port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, port);
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use for SIP");
return NF_DROP;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index bbd1209694b8..7f12e56e6e52 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -21,6 +21,8 @@
#include "nf_internals.h"
+static const struct nf_queue_handler __rcu *nf_queue_handler;
+
/*
* Hook for nfnetlink_queue to register its queue handler.
* We do this so that most of the NFQUEUE code can be modular.
@@ -29,40 +31,43 @@
* receives, no matter what.
*/
-/* return EBUSY when somebody else is registered, return EEXIST if the
- * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(net->nf.queue_handler));
- rcu_assign_pointer(net->nf.queue_handler, qh);
+ WARN_ON(rcu_access_pointer(nf_queue_handler));
+ rcu_assign_pointer(nf_queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(struct net *net)
+void nf_unregister_queue_handler(void)
{
- RCU_INIT_POINTER(net->nf.queue_handler, NULL);
+ RCU_INIT_POINTER(nf_queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
+static void nf_queue_sock_put(struct sock *sk)
+{
+#ifdef CONFIG_INET
+ sock_gen_put(sk);
+#else
+ sock_put(sk);
+#endif
+}
+
static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
/* Release those devices we held, or Alexey will kill me. */
- if (state->in)
- dev_put(state->in);
- if (state->out)
- dev_put(state->out);
+ dev_put(state->in);
+ dev_put(state->out);
if (state->sk)
- sock_put(state->sk);
+ nf_queue_sock_put(state->sk);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (entry->physin)
- dev_put(entry->physin);
- if (entry->physout)
- dev_put(entry->physout);
+ dev_put(entry->physin);
+ dev_put(entry->physout);
#endif
}
@@ -77,11 +82,9 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
const struct sk_buff *skb = entry->skb;
- struct nf_bridge_info *nf_bridge;
- nf_bridge = nf_bridge_info_get(skb);
- if (nf_bridge) {
- entry->physin = nf_bridge_get_physindev(skb);
+ if (nf_bridge_info_exists(skb)) {
+ entry->physin = nf_bridge_get_physindev(skb, entry->state.net);
entry->physout = nf_bridge_get_physoutdev(skb);
} else {
entry->physin = NULL;
@@ -91,23 +94,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
}
/* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
- if (state->in)
- dev_hold(state->in);
- if (state->out)
- dev_hold(state->out);
- if (state->sk)
- sock_hold(state->sk);
+ if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+ return false;
+
+ dev_hold(state->in);
+ dev_hold(state->out);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- if (entry->physin)
- dev_hold(entry->physin);
- if (entry->physout)
- dev_hold(entry->physout);
+ dev_hold(entry->physin);
+ dev_hold(entry->physout);
#endif
+ return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -116,7 +117,7 @@ void nf_queue_nf_hook_drop(struct net *net)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(net->nf.queue_handler);
+ qh = rcu_dereference(nf_queue_handler);
if (qh)
qh->nf_hook_drop(net);
rcu_read_unlock();
@@ -157,12 +158,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
{
struct nf_queue_entry *entry = NULL;
const struct nf_queue_handler *qh;
- struct net *net = state->net;
unsigned int route_key_size;
int status;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(net->nf.queue_handler);
+ qh = rcu_dereference(nf_queue_handler);
if (!qh)
return -ESRCH;
@@ -178,6 +178,18 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
break;
}
+ if (skb_sk_is_prefetched(skb)) {
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk)) {
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ return -ENOTCONN;
+
+ /* drop refcount on skb_orphan */
+ skb->destructor = sock_edemux;
+ }
+ }
+
entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
if (!entry)
return -ENOMEM;
@@ -196,7 +208,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
__nf_queue_entry_init_physdevs(entry);
- nf_queue_entry_get_refs(entry);
+ if (!nf_queue_entry_get_refs(entry)) {
+ kfree(entry);
+ return -ENOTCONN;
+ }
switch (entry->state.pf) {
case AF_INET:
@@ -233,109 +248,3 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
return 0;
}
EXPORT_SYMBOL_GPL(nf_queue);
-
-static unsigned int nf_iterate(struct sk_buff *skb,
- struct nf_hook_state *state,
- const struct nf_hook_entries *hooks,
- unsigned int *index)
-{
- const struct nf_hook_entry *hook;
- unsigned int verdict, i = *index;
-
- while (i < hooks->num_hook_entries) {
- hook = &hooks->hooks[i];
-repeat:
- verdict = nf_hook_entry_hookfn(hook, skb, state);
- if (verdict != NF_ACCEPT) {
- *index = i;
- if (verdict != NF_REPEAT)
- return verdict;
- goto repeat;
- }
- i++;
- }
-
- *index = i;
- return NF_ACCEPT;
-}
-
-static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
-{
- switch (pf) {
-#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
- case NFPROTO_BRIDGE:
- return rcu_dereference(net->nf.hooks_bridge[hooknum]);
-#endif
- case NFPROTO_IPV4:
- return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
- case NFPROTO_IPV6:
- return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
- default:
- WARN_ON_ONCE(1);
- return NULL;
- }
-
- return NULL;
-}
-
-/* Caller must hold rcu read-side lock */
-void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-{
- const struct nf_hook_entry *hook_entry;
- const struct nf_hook_entries *hooks;
- struct sk_buff *skb = entry->skb;
- const struct net *net;
- unsigned int i;
- int err;
- u8 pf;
-
- net = entry->state.net;
- pf = entry->state.pf;
-
- hooks = nf_hook_entries_head(net, pf, entry->state.hook);
-
- i = entry->hook_index;
- if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
- kfree_skb(skb);
- nf_queue_entry_free(entry);
- return;
- }
-
- hook_entry = &hooks->hooks[i];
-
- /* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT)
- verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
-
- if (verdict == NF_ACCEPT) {
- if (nf_reroute(skb, entry) < 0)
- verdict = NF_DROP;
- }
-
- if (verdict == NF_ACCEPT) {
-next_hook:
- ++i;
- verdict = nf_iterate(skb, &entry->state, hooks, &i);
- }
-
- switch (verdict & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_STOP:
- local_bh_disable();
- entry->state.okfn(entry->state.net, entry->state.sk, skb);
- local_bh_enable();
- break;
- case NF_QUEUE:
- err = nf_queue(skb, &entry->state, i, verdict);
- if (err == 1)
- goto next_hook;
- break;
- case NF_STOLEN:
- break;
- default:
- kfree_skb(skb);
- }
-
- nf_queue_entry_free(entry);
-}
-EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 46cb3786e0ec..34afcd03b6f6 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -89,78 +89,32 @@ out:
return ops;
}
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt,
+ unsigned int len)
{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 0);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get)
- ret = ops->get(sk, val, opt, len);
- else
- ret = ops->set(sk, val, opt, *len);
-
+ ret = ops->set(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
- unsigned int len)
-{
- return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
EXPORT_SYMBOL(nf_setsockopt);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
int *len)
{
- return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(nf_getsockopt);
-
-#ifdef CONFIG_COMPAT
-static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
-{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 1);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get) {
- if (ops->compat_get)
- ret = ops->compat_get(sk, val, opt, len);
- else
- ret = ops->get(sk, val, opt, len);
- } else {
- if (ops->compat_set)
- ret = ops->compat_set(sk, val, opt, *len);
- else
- ret = ops->set(sk, val, opt, *len);
- }
-
+ ret = ops->get(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, unsigned int len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-EXPORT_SYMBOL(compat_nf_setsockopt);
-
-int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, int *len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(compat_nf_getsockopt);
-#endif
+EXPORT_SYMBOL(nf_getsockopt);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index b9cbe1e2453e..3fa3f5dfb264 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -5,7 +5,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
@@ -31,6 +31,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
int length = (th->doff * 4) - sizeof(*th);
u8 buf[40], *ptr;
+ if (unlikely(length < 0))
+ return false;
+
ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
if (ptr == NULL)
return false;
@@ -47,6 +50,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
length--;
continue;
default:
+ if (length < 2)
+ return true;
opsize = *ptr++;
if (opsize < 2)
return true;
@@ -148,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
- opts->tsval = tcp_time_stamp_raw() & ~0x3f;
+ opts->tsval = tcp_clock_ms() & ~0x3f;
if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
@@ -231,12 +236,6 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
return 1;
}
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
- .len = sizeof(struct nf_conn_synproxy),
- .align = __alignof__(struct nf_conn_synproxy),
- .id = NF_CT_EXT_SYNPROXY,
-};
-
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -344,7 +343,6 @@ static int __net_init synproxy_net_init(struct net *net)
goto err2;
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -383,28 +381,12 @@ static struct pernet_operations synproxy_net_ops = {
static int __init synproxy_core_init(void)
{
- int err;
-
- err = nf_ct_extend_register(&nf_ct_synproxy_extend);
- if (err < 0)
- goto err1;
-
- err = register_pernet_subsys(&synproxy_net_ops);
- if (err < 0)
- goto err2;
-
- return 0;
-
-err2:
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
- return err;
+ return register_pernet_subsys(&synproxy_net_ops);
}
static void __exit synproxy_core_exit(void)
{
unregister_pernet_subsys(&synproxy_net_ops);
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
}
module_init(synproxy_core_init);
@@ -423,7 +405,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
+ iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
@@ -446,7 +428,7 @@ synproxy_send_tcp(struct net *net,
skb_dst_set_noref(nskb, skb_dst(skb));
nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
goto free_nskb;
if (nfct) {
@@ -635,7 +617,7 @@ synproxy_recv_client_ack(struct net *net,
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
- mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ mss = __cookie_v4_check(ip_hdr(skb), th);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
@@ -704,8 +686,7 @@ ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
+ fallthrough;
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
@@ -819,7 +800,7 @@ synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
skb_reset_network_header(skb);
iph = skb_put(skb, sizeof(*iph));
ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
+ iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit);
iph->nexthdr = IPPROTO_TCP;
iph->saddr = *saddr;
iph->daddr = *daddr;
@@ -850,7 +831,7 @@ synproxy_send_tcp_ipv6(struct net *net,
fl6.fl6_sport = nth->source;
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb,
- flowi6_to_flowi(&fl6));
+ flowi6_to_flowi_common(&fl6));
err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
if (err) {
goto free_nskb;
@@ -1053,7 +1034,7 @@ synproxy_recv_client_ack_ipv6(struct net *net,
struct synproxy_net *snet = synproxy_pernet(net);
int mss;
- mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
+ mss = nf_cookie_v6_check(ipv6_hdr(skb), th);
if (mss == 0) {
this_cpu_inc(snet->stats->cookie_invalid);
return false;
@@ -1128,8 +1109,7 @@ ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
nf_ct_seqadj_init(ct, ctinfo, 0);
synproxy->tsoff = 0;
this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
+ fallthrough;
case TCP_CONNTRACK_SYN_SENT:
if (!synproxy_parse_options(skb, thoff, th, &opts))
return NF_DROP;
@@ -1237,3 +1217,4 @@ EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 073aa1051d43..f3de2f9bbebf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -12,6 +12,7 @@
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
+#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
@@ -23,13 +24,19 @@
#include <net/sock.h>
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
+#define NFT_SET_MAX_ANONLEN 16
+
+/* limit compaction to avoid huge kmalloc/krealloc sizes. */
+#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
+
+unsigned int nf_tables_net_id __read_mostly;
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
-static u64 table_handle;
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
enum {
NFT_VALIDATE_SKIP = 0,
@@ -65,9 +72,45 @@ static const struct rhashtable_params nft_objname_ht_params = {
.automatic_shrinking = true,
};
-static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+struct nft_audit_data {
+ struct nft_table *table;
+ int entries;
+ int op;
+ struct list_head list;
+};
+
+static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
+ [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER,
+ [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER,
+ [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER,
+ [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER,
+ [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER,
+ [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER,
+ [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER,
+ [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER,
+ [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER,
+ [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER,
+ [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER,
+ [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER,
+ [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER,
+ [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET,
+ [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
+ [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+ [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
+};
+
+static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
{
- switch (net->nft.validate_state) {
+ switch (table->validate_state) {
case NFT_VALIDATE_SKIP:
WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
break;
@@ -78,10 +121,12 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
return;
}
- net->nft.validate_state = new_validate_state;
+ table->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
-static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
+
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
@@ -102,108 +147,224 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->report = nlmsg_report(nlh);
ctx->flags = nlh->nlmsg_flags;
ctx->seq = nlh->nlmsg_seq;
+
+ bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}
-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
- int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
+ trans = kzalloc(size, GFP_KERNEL);
if (trans == NULL)
return NULL;
+ INIT_LIST_HEAD(&trans->list);
trans->msg_type = msg_type;
- trans->ctx = *ctx;
+
+ trans->net = ctx->net;
+ trans->table = ctx->table;
+ trans->seq = ctx->seq;
+ trans->flags = ctx->flags;
+ trans->report = ctx->report;
return trans;
}
-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
- int msg_type, u32 size)
+static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
{
- return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ case NFT_MSG_NEWSET:
+ return container_of(trans, struct nft_trans_binding, nft_trans);
+ }
+
+ return NULL;
}
-static void nft_trans_destroy(struct nft_trans *trans)
+static void nft_trans_list_del(struct nft_trans *trans)
{
+ struct nft_trans_binding *trans_binding;
+
list_del(&trans->list);
+
+ trans_binding = nft_trans_get_binding(trans);
+ if (trans_binding)
+ list_del(&trans_binding->binding_list);
+}
+
+static void nft_trans_destroy(struct nft_trans *trans)
+{
+ nft_trans_list_del(trans);
kfree(trans);
}
-static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set,
+ bool bind)
{
+ struct nftables_pernet *nft_net;
struct net *net = ctx->net;
struct nft_trans *trans;
if (!nft_set_is_anonymous(set))
return;
- list_for_each_entry_reverse(trans, &net->nft.commit_list, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
switch (trans->msg_type) {
case NFT_MSG_NEWSET:
if (nft_trans_set(trans) == set)
- nft_trans_set_bound(trans) = true;
+ nft_trans_set_bound(trans) = bind;
break;
case NFT_MSG_NEWSETELEM:
if (nft_trans_elem_set(trans) == set)
- nft_trans_elem_set_bound(trans) = true;
+ nft_trans_elem_set_bound(trans) = bind;
break;
}
}
}
+static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, true);
+}
+
+static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, false);
+}
+
+static void __nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain, bool bind)
+{
+ struct nftables_pernet *nft_net;
+ struct net *net = ctx->net;
+ struct nft_trans *trans;
+
+ if (!nft_chain_binding(chain))
+ return;
+
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain(trans) == chain)
+ nft_trans_chain_bound(trans) = bind;
+ break;
+ case NFT_MSG_NEWRULE:
+ if (nft_trans_rule_chain(trans) == chain)
+ nft_trans_rule_bound(trans) = bind;
+ break;
+ }
+ }
+}
+
+static void nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, true);
+}
+
+int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ if (!nft_chain_binding(chain))
+ return 0;
+
+ if (nft_chain_binding(ctx->chain))
+ return -EOPNOTSUPP;
+
+ if (chain->bound)
+ return -EBUSY;
+
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
+
+ chain->bound = true;
+ nft_chain_trans_bind(ctx, chain);
+
+ return 0;
+}
+
+void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, false);
+}
+
static int nft_netdev_register_hooks(struct net *net,
struct list_head *hook_list)
{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, j;
j = 0;
list_for_each_entry(hook, hook_list, list) {
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0)
- goto err_register;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nf_register_net_hook(net, ops);
+ if (err < 0)
+ goto err_register;
- j++;
+ j++;
+ }
}
return 0;
err_register:
list_for_each_entry(hook, hook_list, list) {
- if (j-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (j-- <= 0)
+ break;
- nf_unregister_net_hook(net, &hook->ops);
+ nf_unregister_net_hook(net, ops);
+ }
}
return err;
}
-static void nft_netdev_unregister_hooks(struct net *net,
- struct list_head *hook_list)
+static void nft_netdev_hook_free_ops(struct nft_hook *hook)
{
- struct nft_hook *hook;
+ struct nf_hook_ops *ops, *next;
- list_for_each_entry(hook, hook_list, list)
- nf_unregister_net_hook(net, &hook->ops);
+ list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
+ list_del(&ops->list);
+ kfree(ops);
+ }
}
-static int nft_register_basechain_hooks(struct net *net, int family,
- struct nft_base_chain *basechain)
+static void nft_netdev_hook_free(struct nft_hook *hook)
{
- if (family == NFPROTO_NETDEV)
- return nft_netdev_register_hooks(net, &basechain->hook_list);
+ nft_netdev_hook_free_ops(hook);
+ kfree(hook);
+}
- return nf_register_net_hook(net, &basechain->ops);
+static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
+{
+ struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu);
+
+ nft_netdev_hook_free(hook);
}
-static void nft_unregister_basechain_hooks(struct net *net, int family,
- struct nft_base_chain *basechain)
+static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
{
- if (family == NFPROTO_NETDEV)
- nft_netdev_unregister_hooks(net, &basechain->hook_list);
- else
- nf_unregister_net_hook(net, &basechain->ops);
+ call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
+}
+
+static void nft_netdev_unregister_hooks(struct net *net,
+ struct list_head *hook_list,
+ bool release_netdev)
+{
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ }
}
static int nf_tables_register_hook(struct net *net,
@@ -223,12 +384,16 @@ static int nf_tables_register_hook(struct net *net,
if (basechain->type->ops_register)
return basechain->type->ops_register(net, ops);
- return nft_register_basechain_hooks(net, table->family, basechain);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+ return nf_register_net_hook(net, &basechain->ops);
}
-static void nf_tables_unregister_hook(struct net *net,
- const struct nft_table *table,
- struct nft_chain *chain)
+static void __nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain,
+ bool release_netdev)
{
struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
@@ -242,7 +407,144 @@ static void nf_tables_unregister_hook(struct net *net,
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- nft_unregister_basechain_hooks(net, table->family, basechain);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ nft_netdev_unregister_hooks(net, &basechain->hook_list,
+ release_netdev);
+ else
+ nf_unregister_net_hook(net, &basechain->ops);
+}
+
+static void nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain)
+{
+ return __nf_tables_unregister_hook(net, table, chain, false);
+}
+
+static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
+{
+ /* NB: the ->bound equality check is defensive, at this time we only merge
+ * a new nft_trans_elem transaction request with the transaction tail
+ * element, but a->bound != b->bound would imply a NEWRULE transaction
+ * is queued in-between.
+ *
+ * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents
+ * huge krealloc() requests.
+ */
+ return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
+}
+
+static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
+ struct nft_trans_elem *tail,
+ struct nft_trans_elem *trans)
+{
+ unsigned int nelems, old_nelems = tail->nelems;
+ struct nft_trans_elem *new_trans;
+
+ if (!nft_trans_collapse_set_elem_allowed(tail, trans))
+ return false;
+
+ /* "cannot happen", at this time userspace element add
+ * requests always allocate a new transaction element.
+ *
+ * This serves as a reminder to adjust the list_add_tail
+ * logic below in case this ever changes.
+ */
+ if (WARN_ON_ONCE(trans->nelems != 1))
+ return false;
+
+ if (check_add_overflow(old_nelems, trans->nelems, &nelems))
+ return false;
+
+ /* krealloc might free tail which invalidates list pointers */
+ list_del_init(&tail->nft_trans.list);
+
+ new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+ GFP_KERNEL);
+ if (!new_trans) {
+ list_add_tail(&tail->nft_trans.list,
+ &nft_net->commit_list);
+ return false;
+ }
+
+ /*
+ * new_trans->nft_trans.list contains garbage, but
+ * list_add_tail() doesn't care.
+ */
+ new_trans->nelems = nelems;
+ new_trans->elems[old_nelems] = trans->elems[0];
+ list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
+
+ return true;
+}
+
+static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
+ struct nft_trans *trans)
+{
+ struct nft_trans *tail;
+
+ if (list_empty(&nft_net->commit_list))
+ return false;
+
+ tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
+
+ if (tail->msg_type != trans->msg_type)
+ return false;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM:
+ return nft_trans_collapse_set_elem(nft_net,
+ nft_trans_container_elem(tail),
+ nft_trans_container_elem(trans));
+ }
+
+ return false;
+}
+
+static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_trans_binding *binding;
+ struct nft_trans_set *trans_set;
+
+ list_add_tail(&trans->list, &nft_net->commit_list);
+
+ binding = nft_trans_get_binding(trans);
+ if (!binding)
+ return;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ trans_set = nft_trans_container_set(trans);
+
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+
+ list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+ break;
+ }
+}
+
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
+ trans->msg_type != NFT_MSG_DELSETELEM);
+
+ if (nft_trans_try_collapse(nft_net, trans)) {
+ kfree(trans);
+ return;
+ }
+
+ nft_trans_commit_list_add_tail(net, trans);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -256,7 +558,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
if (msg_type == NFT_MSG_NEWTABLE)
nft_activate_next(ctx->net, ctx->table);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -272,18 +574,41 @@ static int nft_deltable(struct nft_ctx *ctx)
return err;
}
-static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+static struct nft_trans *
+nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type)
{
+ struct nft_trans_chain *trans_chain;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
+ if (!trans)
+ return NULL;
+
+ trans_chain = nft_trans_container_chain(trans);
+ INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list);
+ trans_chain->chain = ctx->chain;
+
+ return trans;
+}
+
+static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+{
+ struct nft_trans *trans;
+
+ trans = nft_trans_alloc_chain(ctx, msg_type);
if (trans == NULL)
return ERR_PTR(-ENOMEM);
- if (msg_type == NFT_MSG_NEWCHAIN)
+ if (msg_type == NFT_MSG_NEWCHAIN) {
nft_activate_next(ctx->net, ctx->chain);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ if (ctx->nla[NFTA_CHAIN_ID]) {
+ nft_trans_chain_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
+ }
+ }
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
return trans;
}
@@ -295,19 +620,18 @@ static int nft_delchain(struct nft_ctx *ctx)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
nft_deactivate_next(ctx->net, ctx->chain);
return 0;
}
-static void nft_rule_expr_activate(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->activate)
expr->ops->activate(ctx, expr);
@@ -315,14 +639,13 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
}
}
-static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
- struct nft_rule *rule,
- enum nft_trans_phase phase)
+void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
+ enum nft_trans_phase phase)
{
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->deactivate)
expr->ops->deactivate(ctx, expr, phase);
@@ -336,7 +659,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
/* You cannot delete the same rule twice */
if (nft_is_active_next(ctx->net, rule)) {
nft_deactivate_next(ctx->net, rule);
- ctx->chain->use--;
+ nft_use_dec(&ctx->chain->use);
return 0;
}
return -ENOENT;
@@ -356,7 +679,8 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
}
nft_trans_rule(trans) = rule;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_rule_chain(trans) = ctx->chain;
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return trans;
}
@@ -407,26 +731,98 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
return 0;
}
-static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
- struct nft_set *set)
+static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set,
+ const struct nft_set_desc *desc)
{
+ struct nft_trans_set *trans_set;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
if (trans == NULL)
return -ENOMEM;
- if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+ trans_set = nft_trans_container_set(trans);
+ INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list);
+ INIT_LIST_HEAD(&trans_set->list_trans_newset);
+
+ if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) {
nft_trans_set_id(trans) =
ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
nft_activate_next(ctx->net, set);
}
nft_trans_set(trans) = set;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ if (desc) {
+ nft_trans_set_update(trans) = true;
+ nft_trans_set_gc_int(trans) = desc->gc_int;
+ nft_trans_set_timeout(trans) = desc->timeout;
+ nft_trans_set_size(trans) = desc->size;
+ }
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
+static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set)
+{
+ return __nft_trans_set_add(ctx, msg_type, set, NULL);
+}
+
+static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+
+ return 0;
+}
+
+struct nft_set_elem_catchall {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct nft_elem_priv *elem;
+};
+
+static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, catchall->elem);
+ break;
+ }
+}
+
+static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_deactivate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_deactivate(ctx, set);
+}
+
static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
int err;
@@ -435,8 +831,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
if (err < 0)
return err;
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
nft_deactivate_next(ctx->net, set);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
@@ -454,7 +853,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
nft_activate_next(ctx->net, obj);
nft_trans_obj(trans) = obj;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -468,64 +867,125 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
return err;
nft_deactivate_next(ctx->net, obj);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
-static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
- struct nft_flowtable *flowtable)
+static struct nft_trans *
+nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_flowtable *flowtable)
{
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type,
sizeof(struct nft_trans_flowtable));
if (trans == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (msg_type == NFT_MSG_NEWFLOWTABLE)
nft_activate_next(ctx->net, flowtable);
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
nft_trans_flowtable(trans) = flowtable;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
- return 0;
+ return trans;
}
static int nft_delflowtable(struct nft_ctx *ctx,
struct nft_flowtable *flowtable)
{
- int err;
+ struct nft_trans *trans;
- err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
- if (err < 0)
- return err;
+ trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
nft_deactivate_next(ctx->net, flowtable);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
- return err;
+ return 0;
}
+static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
+{
+ int i;
+
+ for (i = track->regs[dreg].num_reg; i > 0; i--)
+ __nft_reg_track_cancel(track, dreg - i);
+}
+
+static void __nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr,
+ u8 dreg, u8 num_reg)
+{
+ track->regs[dreg].selector = expr;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = num_reg;
+}
+
+void nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_update(track, expr, dreg, i);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_update);
+
+void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_cancel(track, dreg);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_cancel);
+
+void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
+{
+ track->regs[dreg].selector = NULL;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = 0;
+}
+EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);
+
/*
* Tables
*/
static struct nft_table *nft_table_lookup(const struct net *net,
const struct nlattr *nla,
- u8 family, u8 genmask)
+ u8 family, u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(table, &net->nft.tables, list,
- lockdep_is_held(&net->nft.commit_mutex)) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry_rcu(table, &nft_net->tables, list,
+ lockdep_is_held(&nft_net->commit_mutex)) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
- nft_active_genmask(table, genmask))
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
@@ -533,14 +993,22 @@ static struct nft_table *nft_table_lookup(const struct net *net,
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
const struct nlattr *nla,
- u8 genmask)
+ int family, u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
- list_for_each_entry(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry(table, &nft_net->tables, list) {
if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
- nft_active_genmask(table, genmask))
+ table->family == family &&
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
@@ -586,9 +1054,11 @@ struct nft_module_request {
};
#ifdef CONFIG_MODULES
-static int nft_request_module(struct net *net, const char *fmt, ...)
+__printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
+ ...)
{
char module_name[MODULE_NAME_LEN];
+ struct nftables_pernet *nft_net;
struct nft_module_request *req;
va_list args;
int ret;
@@ -599,7 +1069,8 @@ static int nft_request_module(struct net *net, const char *fmt, ...)
if (ret >= MODULE_NAME_LEN)
return 0;
- list_for_each_entry(req, &net->nft.module_list, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry(req, &nft_net->module_list, list) {
if (!strcmp(req->module, module_name)) {
if (req->done)
return 0;
@@ -614,17 +1085,19 @@ static int nft_request_module(struct net *net, const char *fmt, ...)
return -ENOMEM;
req->done = false;
- strlcpy(req->module, module_name, MODULE_NAME_LEN);
- list_add_tail(&req->list, &net->nft.module_list);
+ strscpy(req->module, module_name, MODULE_NAME_LEN);
+ list_add_tail(&req->list, &nft_net->module_list);
return -EAGAIN;
}
+EXPORT_SYMBOL_GPL(nft_request_module);
#endif
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ if (debug_locks)
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}
@@ -650,11 +1123,23 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
return ERR_PTR(-ENOENT);
}
+static unsigned int nft_base_seq(const struct net *net)
+{
+ return READ_ONCE(net->nft.base_seq);
+}
+
+static __be16 nft_base_seq_be16(const struct net *net)
+{
+ return htons(nft_base_seq(net) & 0xffff);
+}
+
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
[NFTA_TABLE_NAME] = { .type = NLA_STRING,
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
[NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN }
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -662,25 +1147,38 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
- nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELTABLE ||
+ event == NFT_MSG_DESTROYTABLE) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
+ htonl(table->flags & NFT_TABLE_F_MASK)))
+ goto nla_put_failure;
+
+ if (nft_table_has_owner(table) &&
+ nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
+ goto nla_put_failure;
+
+ if (table->udata) {
+ if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
+ goto nla_put_failure;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -689,9 +1187,23 @@ nla_put_failure:
return -1;
}
+struct nftnl_skb_parms {
+ bool report;
+};
+#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb))
+
+static void nft_notify_enqueue(struct sk_buff *skb, bool report,
+ struct list_head *notify_list)
+{
+ NFT_CB(skb).report = report;
+ list_add_tail(&skb->list, notify_list);
+}
+
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -702,15 +1214,18 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table);
+ event, flags, ctx->family, ctx->table);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -720,15 +1235,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -774,28 +1291,27 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
}
/* called with rcu_read_lock held */
-static int nf_tables_gettable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_tables,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
return PTR_ERR(table);
@@ -806,14 +1322,14 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
- family, table);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
+ 0, family, table);
if (err < 0)
- goto err;
+ goto err_fill_table_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_table_info:
kfree_skb(skb2);
return err;
}
@@ -832,8 +1348,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
if (cnt && i++ == cnt)
break;
- nft_unregister_basechain_hooks(net, table->family,
- nft_base_chain(chain));
+ nf_tables_unregister_hook(net, table, chain);
}
}
@@ -848,8 +1363,7 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
if (!nft_is_base_chain(chain))
continue;
- err = nft_register_basechain_hooks(net, table->family,
- nft_base_chain(chain));
+ err = nf_tables_register_hook(net, table, chain);
if (err < 0)
goto err_register_hooks;
@@ -865,25 +1379,68 @@ err_register_hooks:
static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
+ table->flags &= ~NFT_TABLE_F_DORMANT;
nft_table_disable(net, table, 0);
+ table->flags |= NFT_TABLE_F_DORMANT;
+}
+
+#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1)
+#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0)
+#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1)
+#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2)
+#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \
+ __NFT_TABLE_F_WAS_AWAKEN | \
+ __NFT_TABLE_F_WAS_ORPHAN)
+
+static bool nft_table_pending_update(const struct nft_ctx *ctx)
+{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ struct nft_trans *trans;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return true;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->table == ctx->table &&
+ ((trans->msg_type == NFT_MSG_NEWCHAIN &&
+ nft_trans_chain_update(trans)) ||
+ (trans->msg_type == NFT_MSG_DELCHAIN &&
+ nft_is_base_chain(nft_trans_chain(trans)))))
+ return true;
+ }
+
+ return false;
}
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
u32 flags;
- int ret = 0;
+ int ret;
if (!ctx->nla[NFTA_TABLE_FLAGS])
return 0;
flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
- if (flags == ctx->table->flags)
+ if (flags == (ctx->table->flags & NFT_TABLE_F_MASK))
return 0;
+ if ((nft_table_has_owner(ctx->table) &&
+ !(flags & NFT_TABLE_F_OWNER)) ||
+ (flags & NFT_TABLE_F_OWNER &&
+ !nft_table_is_orphan(ctx->table)))
+ return -EOPNOTSUPP;
+
+ if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
+ return -EOPNOTSUPP;
+
+ /* No dormant off/on/off/on games in single transaction */
+ if (nft_table_pending_update(ctx))
+ return -EINVAL;
+
trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
sizeof(struct nft_trans_table));
if (trans == NULL)
@@ -891,22 +1448,35 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
+ ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ret = nf_tables_table_enable(ctx->net, ctx->table);
- if (ret >= 0) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
- nft_trans_table_enable(trans) = true;
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
+ ret = nf_tables_table_enable(ctx->net, ctx->table);
+ if (ret < 0)
+ goto err_register_hooks;
+
+ ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
}
}
- if (ret < 0)
- goto err;
+
+ if ((flags & NFT_TABLE_F_OWNER) &&
+ !nft_table_has_owner(ctx->table)) {
+ ctx->table->nlpid = ctx->portid;
+ ctx->table->flags |= NFT_TABLE_F_OWNER |
+ __NFT_TABLE_F_WAS_ORPHAN;
+ }
nft_trans_table_update(trans) = true;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
return 0;
-err:
+
+err_register_hooks:
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
nft_trans_destroy(trans);
return ret;
}
@@ -962,53 +1532,91 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
return strcmp(obj->key.name, k->name);
}
-static int nf_tables_newtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static bool nft_supported_family(u8 family)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ return false
+#ifdef CONFIG_NF_TABLES_INET
+ || family == NFPROTO_INET
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ || family == NFPROTO_IPV4
+#endif
+#ifdef CONFIG_NF_TABLES_ARP
+ || family == NFPROTO_ARP
+#endif
+#ifdef CONFIG_NF_TABLES_NETDEV
+ || family == NFPROTO_NETDEV
+#endif
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ || family == NFPROTO_BRIDGE
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ || family == NFPROTO_IPV6
+#endif
+ ;
+}
+
+static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- u32 flags = 0;
struct nft_ctx ctx;
+ u32 flags = 0;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ if (!nft_supported_family(family))
+ return -EOPNOTSUPP;
+
+ lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nf_tables_updtable(&ctx);
}
if (nla[NFTA_TABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
}
err = -ENOMEM;
- table = kzalloc(sizeof(*table), GFP_KERNEL);
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(attr, GFP_KERNEL);
+ table->validate_state = nft_net->validate_state;
+ table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
+ if (nla[NFTA_TABLE_USERDATA]) {
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (table->udata == NULL)
+ goto err_table_udata;
+
+ table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+ }
+
err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;
@@ -1019,18 +1627,22 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
- table->handle = ++table_handle;
+ table->handle = ++nft_net->table_handle;
+ if (table->flags & NFT_TABLE_F_OWNER)
+ table->nlpid = NETLINK_CB(skb).portid;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err_trans;
- list_add_tail_rcu(&table->list, &net->nft.tables);
+ list_add_tail_rcu(&table->list, &nft_net->tables);
return 0;
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
+ kfree(table->udata);
+err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
@@ -1050,6 +1662,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delrule_by_chain(ctx);
@@ -1061,8 +1676,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, set))
continue;
- if (nft_set_is_anonymous(set) &&
- !list_empty(&set->bindings))
+ if (nft_set_is_anonymous(set))
continue;
err = nft_delset(ctx, set);
@@ -1092,6 +1706,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delchain(ctx);
@@ -1106,11 +1723,12 @@ out:
static int nft_flush(struct nft_ctx *ctx, int family)
{
- struct nft_table *table, *nt;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nlattr * const *nla = ctx->nla;
+ struct nft_table *table, *nt;
int err = 0;
- list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) {
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
if (family != AF_UNSPEC && table->family != family)
continue;
@@ -1119,6 +1737,9 @@ static int nft_flush(struct nft_ctx *ctx, int family)
if (!nft_is_active_next(ctx->net, table))
continue;
+ if (nft_table_has_owner(table) && table->nlpid != ctx->portid)
+ continue;
+
if (nla[NFTA_TABLE_NAME] &&
nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
continue;
@@ -1133,37 +1754,42 @@ out:
return err;
}
-static int nf_tables_deltable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);
if (family == AF_UNSPEC ||
(!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
if (nla[NFTA_TABLE_HANDLE]) {
attr = nla[NFTA_TABLE_HANDLE];
- table = nft_table_lookup_byhandle(net, attr, genmask);
+ table = nft_table_lookup_byhandle(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
} else {
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
}
if (IS_ERR(table)) {
+ if (PTR_ERR(table) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(table);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
table->use > 0)
return -EBUSY;
@@ -1173,14 +1799,15 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
return nft_flush_table(&ctx);
}
-static void nf_tables_table_destroy(struct nft_ctx *ctx)
+static void nf_tables_table_destroy(struct nft_table *table)
{
- if (WARN_ON(ctx->table->use > 0))
+ if (WARN_ON(table->use > 0))
return;
- rhltable_destroy(&ctx->table->chains_ht);
- kfree(ctx->table->name);
- kfree(ctx->table);
+ rhltable_destroy(&table->chains_ht);
+ kfree(table->name);
+ kfree(table->udata);
+ kfree(table);
}
void nft_register_chain_type(const struct nft_chain_type *ctype)
@@ -1224,7 +1851,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
static bool lockdep_commit_lock_is_held(const struct net *net)
{
#ifdef CONFIG_PROVE_LOCKING
- return lockdep_is_held(&net->nft.commit_mutex);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ return lockdep_is_held(&nft_net->commit_mutex);
#else
return true;
#endif
@@ -1241,7 +1870,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
!lockdep_commit_lock_is_held(net));
@@ -1274,6 +1903,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
.len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
[NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
+ [NFTA_CHAIN_ID] = { .type = NLA_U32 },
+ [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1298,10 +1930,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
for_each_possible_cpu(cpu) {
cpu_stats = per_cpu_ptr(stats, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ seq = u64_stats_fetch_begin(&cpu_stats->syncp);
pkts = cpu_stats->pkts;
bytes = cpu_stats->bytes;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
total.pkts += pkts;
total.bytes += bytes;
}
@@ -1322,8 +1954,22 @@ nla_put_failure:
return -ENOSPC;
}
-static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
- const struct nft_base_chain *basechain)
+static bool hook_is_prefix(struct nft_hook *hook)
+{
+ return strlen(hook->ifname) >= hook->ifnamelen;
+}
+
+static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook)
+{
+ int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME;
+
+ return nla_put_string(skb, attr, hook->ifname);
+}
+
+static int nft_dump_basechain_hook(struct sk_buff *skb,
+ const struct net *net, int family,
+ const struct nft_base_chain *basechain,
+ const struct list_head *hook_list)
{
const struct nf_hook_ops *ops = &basechain->ops;
struct nft_hook *hook, *first = NULL;
@@ -1338,21 +1984,28 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
goto nla_put_failure;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, ops->hooknum)) {
nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
- list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (!nest_devs)
+ goto nla_put_failure;
+
+ if (!hook_list)
+ hook_list = &basechain->hook_list;
+
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
if (!first)
first = hook;
- if (nla_put_string(skb, NFTA_DEVICE_NAME,
- hook->ops.dev->name))
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
n++;
}
nla_nest_end(skb, nest_devs);
if (n == 1 &&
- nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+ !hook_is_prefix(first) &&
+ nla_put_string(skb, NFTA_HOOK_DEV, first->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -1365,34 +2018,35 @@ nla_put_failure:
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event, u32 flags,
int family, const struct nft_table *table,
- const struct nft_chain *chain)
+ const struct nft_chain *chain,
+ const struct list_head *hook_list)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
- if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
+ if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) ||
+ nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
NFTA_CHAIN_PAD))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
- goto nla_put_failure;
+
+ if (!hook_list &&
+ (event == NFT_MSG_DELCHAIN ||
+ event == NFT_MSG_DESTROYCHAIN)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
if (nft_is_base_chain(chain)) {
const struct nft_base_chain *basechain = nft_base_chain(chain);
struct nft_stats __percpu *stats;
- if (nft_dump_basechain_hook(skb, family, basechain))
+ if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1406,16 +2060,19 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
lockdep_commit_lock_is_held(net));
if (nft_dump_stats(skb, stats))
goto nla_put_failure;
-
- if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) &&
- nla_put_be32(skb, NFTA_CHAIN_FLAGS,
- htonl(NFT_CHAIN_HW_OFFLOAD)))
- goto nla_put_failure;
}
+ if (chain->flags &&
+ nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags)))
+ goto nla_put_failure;
+
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
goto nla_put_failure;
+ if (chain->udata &&
+ nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1424,9 +2081,12 @@ nla_put_failure:
return -1;
}
-static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
+ const struct list_head *hook_list)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -1437,16 +2097,19 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
- ctx->chain);
+ event, flags, ctx->family, ctx->table,
+ ctx->chain, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -1456,16 +2119,18 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- const struct nft_chain *chain;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -1483,7 +2148,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
NFT_MSG_NEWCHAIN,
NLM_F_MULTI,
table->family, table,
- chain) < 0)
+ chain, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1498,29 +2163,28 @@ done:
}
/* called with rcu_read_lock held */
-static int nf_tables_getchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_chains,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -1537,14 +2201,14 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
- family, table, chain);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
+ 0, family, table, chain, NULL);
if (err < 0)
- goto err;
+ goto err_fill_chain_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_chain_info:
kfree_skb(skb2);
return err;
}
@@ -1564,14 +2228,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr,
nft_counter_policy, NULL);
if (err < 0)
- return ERR_PTR(err);
+ return ERR_PTR_PCPU(err);
if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
- return ERR_PTR(-EINVAL);
+ return ERR_PTR_PCPU(-EINVAL);
newstats = netdev_alloc_pcpu_stats(struct nft_stats);
if (newstats == NULL)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR_PCPU(-ENOMEM);
/* Restore old counters on this cpu, no problem. Per-cpu statistics
* are not exposed to userspace.
@@ -1585,38 +2249,39 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct nft_trans *trans)
+static void nft_chain_stats_replace(struct nft_trans_chain *trans)
{
- struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain);
+ const struct nft_trans *t = &trans->nft_trans_binding.nft_trans;
+ struct nft_base_chain *chain = nft_base_chain(trans->chain);
- if (!nft_trans_chain_stats(trans))
+ if (!trans->stats)
return;
- nft_trans_chain_stats(trans) =
- rcu_replace_pointer(chain->stats, nft_trans_chain_stats(trans),
- lockdep_commit_lock_is_held(trans->ctx.net));
+ trans->stats =
+ rcu_replace_pointer(chain->stats, trans->stats,
+ lockdep_commit_lock_is_held(t->net));
- if (!nft_trans_chain_stats(trans))
+ if (!trans->stats)
static_branch_inc(&nft_counters_enabled);
}
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
- struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
- struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+ struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
+ struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);
if (g0 != g1)
kvfree(g1);
kvfree(g0);
/* should be NULL either via abort or via successful commit */
- WARN_ON_ONCE(chain->rules_next);
- kvfree(chain->rules_next);
+ WARN_ON_ONCE(chain->blob_next);
+ kvfree(chain->blob_next);
}
-static void nf_tables_chain_destroy(struct nft_ctx *ctx)
+void nf_tables_chain_destroy(struct nft_chain *chain)
{
- struct nft_chain *chain = ctx->chain;
+ const struct nft_table *table = chain->table;
struct nft_hook *hook, *next;
if (WARN_ON(chain->use > 0))
@@ -1628,11 +2293,11 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
- if (ctx->family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
module_put(basechain->type->owner);
@@ -1641,41 +2306,57 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
free_percpu(rcu_dereference_raw(basechain->stats));
}
kfree(chain->name);
+ kfree(chain->udata);
kfree(basechain);
} else {
kfree(chain->name);
+ kfree(chain->udata);
kfree(chain);
}
}
static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
- const struct nlattr *attr)
+ const struct nlattr *attr,
+ bool prefix)
{
+ struct nf_hook_ops *ops;
struct net_device *dev;
- char ifname[IFNAMSIZ];
struct nft_hook *hook;
int err;
- hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
- if (!hook) {
- err = -ENOMEM;
- goto err_hook_alloc;
- }
+ hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
+ if (!hook)
+ return ERR_PTR(-ENOMEM);
- nla_strlcpy(ifname, attr, IFNAMSIZ);
- dev = __dev_get_by_name(net, ifname);
- if (!dev) {
- err = -ENOENT;
- goto err_hook_dev;
- }
- hook->ops.dev = dev;
- hook->inactive = false;
+ INIT_LIST_HEAD(&hook->ops_list);
+
+ err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
+ if (err < 0)
+ goto err_hook_free;
+ /* include the terminating NUL-char when comparing non-prefixes */
+ hook->ifnamelen = strlen(hook->ifname) + !prefix;
+
+ /* nf_tables_netdev_event() is called under rtnl_mutex, this is
+ * indirectly serializing all the other holders of the commit_mutex with
+ * the rtnl_mutex.
+ */
+ for_each_netdev(net, dev) {
+ if (strncmp(dev->name, hook->ifname, hook->ifnamelen))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
+ err = -ENOMEM;
+ goto err_hook_free;
+ }
+ ops->dev = dev;
+ list_add_tail(&ops->list, &hook->ops_list);
+ }
return hook;
-err_hook_dev:
- kfree(hook);
-err_hook_alloc:
+err_hook_free:
+ nft_netdev_hook_free(hook);
return ERR_PTR(err);
}
@@ -1685,7 +2366,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
- if (this->ops.dev == hook->ops.dev)
+ if (!strncmp(hook->ifname, this->ifname,
+ min(hook->ifnamelen, this->ifnamelen)))
return hook;
}
@@ -1694,25 +2376,36 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
static int nf_tables_parse_netdev_hooks(struct net *net,
const struct nlattr *attr,
- struct list_head *hook_list)
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack)
{
struct nft_hook *hook, *next;
const struct nlattr *tmp;
int rem, n = 0, err;
+ bool prefix;
nla_for_each_nested(tmp, attr, rem) {
- if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+ switch (nla_type(tmp)) {
+ case NFTA_DEVICE_NAME:
+ prefix = false;
+ break;
+ case NFTA_DEVICE_PREFIX:
+ prefix = true;
+ break;
+ default:
err = -EINVAL;
goto err_hook;
}
- hook = nft_netdev_hook_alloc(net, tmp);
+ hook = nft_netdev_hook_alloc(net, tmp, prefix);
if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tmp);
err = PTR_ERR(hook);
goto err_hook;
}
if (nft_hook_list_find(hook_list, hook)) {
- kfree(hook);
+ NL_SET_BAD_ATTR(extack, tmp);
+ nft_netdev_hook_free(hook);
err = -EEXIST;
goto err_hook;
}
@@ -1730,7 +2423,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
err_hook:
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
}
return err;
}
@@ -1742,44 +2435,48 @@ struct nft_chain_hook {
struct list_head list;
};
-static int nft_chain_parse_netdev(struct net *net,
- struct nlattr *tb[],
- struct list_head *hook_list)
+static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack, u32 flags)
{
struct nft_hook *hook;
int err;
if (tb[NFTA_HOOK_DEV]) {
- hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
- if (IS_ERR(hook))
+ hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false);
+ if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]);
return PTR_ERR(hook);
+ }
list_add_tail(&hook->list, hook_list);
} else if (tb[NFTA_HOOK_DEVS]) {
err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
- hook_list);
+ hook_list, extack);
if (err < 0)
return err;
- if (list_empty(hook_list))
- return -EINVAL;
- } else {
- return -EINVAL;
}
+ if (flags & NFT_CHAIN_HW_OFFLOAD &&
+ list_empty(hook_list))
+ return -EINVAL;
+
return 0;
}
static int nft_chain_parse_hook(struct net *net,
+ struct nft_base_chain *basechain,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- bool autoload)
+ u32 flags, struct netlink_ext_ack *extack)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX,
@@ -1788,38 +2485,69 @@ static int nft_chain_parse_hook(struct net *net,
if (err < 0)
return err;
- if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
- ha[NFTA_HOOK_PRIORITY] == NULL)
- return -EINVAL;
+ if (!basechain) {
+ if (!ha[NFTA_HOOK_HOOKNUM] ||
+ !ha[NFTA_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
+ return -ENOENT;
+ }
- hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
- hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
- type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
- if (!type)
- return -EOPNOTSUPP;
+ type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+ if (!type)
+ return -EOPNOTSUPP;
- if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
- family, autoload);
- if (IS_ERR(type))
- return PTR_ERR(type);
- }
- if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
- return -EOPNOTSUPP;
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+ family, true);
+ if (IS_ERR(type)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return PTR_ERR(type);
+ }
+ }
+ if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+ return -EOPNOTSUPP;
- if (type->type == NFT_CHAIN_T_NAT &&
- hook->priority <= NF_IP_PRI_CONNTRACK)
- return -EOPNOTSUPP;
+ if (type->type == NFT_CHAIN_T_NAT &&
+ hook->priority <= NF_IP_PRI_CONNTRACK)
+ return -EOPNOTSUPP;
+ } else {
+ if (ha[NFTA_HOOK_HOOKNUM]) {
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hook->num != basechain->ops.hooknum)
+ return -EOPNOTSUPP;
+ }
+ if (ha[NFTA_HOOK_PRIORITY]) {
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ if (hook->priority != basechain->ops.priority)
+ return -EOPNOTSUPP;
+ }
- if (!try_module_get(type->owner))
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+ family);
+ if (!type) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return -ENOENT;
+ }
+ } else {
+ type = basechain->type;
+ }
+ }
+
+ if (!try_module_get(type->owner)) {
+ if (nla[NFTA_CHAIN_TYPE])
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
return -ENOENT;
+ }
hook->type = type;
INIT_LIST_HEAD(&hook->list);
- if (family == NFPROTO_NETDEV) {
- err = nft_chain_parse_netdev(net, ha, &hook->list);
+ if (nft_base_chain_netdev(family, hook->num)) {
+ err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags);
if (err < 0) {
module_put(type->owner);
return err;
@@ -1838,99 +2566,133 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
list_for_each_entry_safe(h, next, &hook->list, list) {
list_del(&h->list);
- kfree(h);
+ nft_netdev_hook_free(h);
}
module_put(hook->type->owner);
}
-struct nft_rules_old {
- struct rcu_head h;
- struct nft_rule **start;
-};
+static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
+{
+ struct nft_rule_dp_last *lrule;
+
+ BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0);
+
+ lrule = (struct nft_rule_dp_last *)ptr;
+ lrule->end.is_last = 1;
+ lrule->chain = chain;
+ /* blob size does not include the trailer rule */
+}
-static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
- unsigned int alloc)
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+ unsigned int size)
{
- if (alloc > INT_MAX)
+ struct nft_rule_blob *blob;
+
+ if (size > INT_MAX)
return NULL;
- alloc += 1; /* NULL, ends rules */
- if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+ size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last);
+
+ blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!blob)
return NULL;
- alloc *= sizeof(struct nft_rule *);
- alloc += sizeof(struct nft_rules_old);
+ blob->size = 0;
+ nft_last_rule(chain, blob->data);
- return kvmalloc(alloc, GFP_KERNEL);
+ return blob;
}
static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
const struct nft_chain_hook *hook,
struct nft_chain *chain)
{
- ops->pf = family;
- ops->hooknum = hook->num;
- ops->priority = hook->priority;
- ops->priv = chain;
- ops->hook = hook->type->hooks[ops->hooknum];
+ ops->pf = family;
+ ops->hooknum = hook->num;
+ ops->priority = hook->priority;
+ ops->priv = chain;
+ ops->hook = hook->type->hooks[ops->hooknum];
+ ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
}
static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
struct nft_chain_hook *hook, u32 flags)
{
struct nft_chain *chain;
+ struct nf_hook_ops *ops;
struct nft_hook *h;
basechain->type = hook->type;
INIT_LIST_HEAD(&basechain->hook_list);
chain = &basechain->chain;
- if (family == NFPROTO_NETDEV) {
+ if (nft_base_chain_netdev(family, hook->num)) {
list_splice_init(&hook->list, &basechain->hook_list);
- list_for_each_entry(h, &basechain->hook_list, list)
- nft_basechain_hook_init(&h->ops, family, hook, chain);
-
- basechain->ops.hooknum = hook->num;
- basechain->ops.priority = hook->priority;
- } else {
- nft_basechain_hook_init(&basechain->ops, family, hook, chain);
+ list_for_each_entry(h, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nft_basechain_hook_init(ops, family, hook, chain);
+ }
}
+ nft_basechain_hook_init(&basechain->ops, family, hook, chain);
- chain->flags |= NFT_BASE_CHAIN | flags;
+ chain->flags |= NFT_CHAIN_BASE | flags;
basechain->policy = NF_ACCEPT;
if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
- nft_chain_offload_priority(basechain) < 0)
+ !nft_chain_offload_support(basechain)) {
+ list_splice_init(&basechain->hook_list, &hook->list);
return -EOPNOTSUPP;
+ }
flow_block_init(&basechain->flow_block);
return 0;
}
-static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, u32 flags)
+int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
+{
+ int err;
+
+ err = rhltable_insert_key(&table->chains_ht, chain->name,
+ &chain->rhlhead, nft_chain_ht_params);
+ if (err)
+ return err;
+
+ list_add_tail_rcu(&chain->list, &table->chains);
+
+ return 0;
+}
+
+static u64 chain_id;
+
+static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy,
+ u32 flags, struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
struct nft_base_chain *basechain;
- struct nft_stats __percpu *stats;
struct net *net = ctx->net;
+ char name[NFT_NAME_MAXLEN];
+ struct nft_rule_blob *blob;
struct nft_trans *trans;
struct nft_chain *chain;
- struct nft_rule **rules;
int err;
- if (table->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_CHAIN_HOOK]) {
- struct nft_chain_hook hook;
+ struct nft_stats __percpu *stats = NULL;
+ struct nft_chain_hook hook = {};
- err = nft_chain_parse_hook(net, nla, &hook, family, true);
+ if (table->flags & __NFT_TABLE_F_UPDATE)
+ return -EINVAL;
+
+ if (flags & NFT_CHAIN_BINDING)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags,
+ extack);
if (err < 0)
return err;
- basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
if (basechain == NULL) {
nft_chain_release_hook(&hook);
return -ENOMEM;
@@ -1939,145 +2701,192 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats)) {
+ if (IS_ERR_PCPU(stats)) {
nft_chain_release_hook(&hook);
kfree(basechain);
- return PTR_ERR(stats);
+ return PTR_ERR_PCPU(stats);
}
rcu_assign_pointer(basechain->stats, stats);
- static_branch_inc(&nft_counters_enabled);
}
err = nft_basechain_init(basechain, family, &hook, flags);
if (err < 0) {
nft_chain_release_hook(&hook);
kfree(basechain);
+ free_percpu(stats);
return err;
}
+ if (stats)
+ static_branch_inc(&nft_counters_enabled);
} else {
- chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (flags & NFT_CHAIN_BASE)
+ return -EINVAL;
+ if (flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
+
+ chain->flags = flags;
}
ctx->chain = chain;
INIT_LIST_HEAD(&chain->rules);
chain->handle = nf_tables_alloc_handle(table);
chain->table = table;
- chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+
+ if (nla[NFTA_CHAIN_NAME]) {
+ chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
+ } else {
+ if (!(flags & NFT_CHAIN_BINDING)) {
+ err = -EINVAL;
+ goto err_destroy_chain;
+ }
+
+ snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
+ chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
+ }
+
if (!chain->name) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
}
- rules = nf_tables_chain_alloc_rules(chain, 0);
- if (!rules) {
- err = -ENOMEM;
- goto err1;
+ if (nla[NFTA_CHAIN_USERDATA]) {
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (chain->udata == NULL) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
+ chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
- *rules = NULL;
- rcu_assign_pointer(chain->rules_gen_0, rules);
- rcu_assign_pointer(chain->rules_gen_1, rules);
+ blob = nf_tables_chain_alloc_rules(chain, 0);
+ if (!blob) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
- err = nf_tables_register_hook(net, table, chain);
- if (err < 0)
- goto err1;
+ RCU_INIT_POINTER(chain->blob_gen_0, blob);
+ RCU_INIT_POINTER(chain->blob_gen_1, blob);
- err = rhltable_insert_key(&table->chains_ht, chain->name,
- &chain->rhlhead, nft_chain_ht_params);
- if (err)
- goto err2;
+ if (!nft_use_inc(&table->use)) {
+ err = -EMFILE;
+ goto err_destroy_chain;
+ }
trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- rhltable_remove(&table->chains_ht, &chain->rhlhead,
- nft_chain_ht_params);
- goto err2;
+ goto err_trans;
}
nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
if (nft_is_base_chain(chain))
nft_trans_chain_policy(trans) = policy;
- table->use++;
- list_add_tail_rcu(&chain->list, &table->chains);
-
- return 0;
-err2:
- nf_tables_unregister_hook(net, table, chain);
-err1:
- nf_tables_chain_destroy(ctx);
-
- return err;
-}
+ err = nft_chain_add(table, chain);
+ if (err < 0)
+ goto err_chain_add;
-static bool nft_hook_list_equal(struct list_head *hook_list1,
- struct list_head *hook_list2)
-{
- struct nft_hook *hook;
- int n = 0, m = 0;
+ /* This must be LAST to ensure no packets are walking over this chain. */
+ err = nf_tables_register_hook(net, table, chain);
+ if (err < 0)
+ goto err_register_hook;
- n = 0;
- list_for_each_entry(hook, hook_list2, list) {
- if (!nft_hook_list_find(hook_list1, hook))
- return false;
+ return 0;
- n++;
- }
- list_for_each_entry(hook, hook_list1, list)
- m++;
+err_register_hook:
+ nft_chain_del(chain);
+err_chain_add:
+ nft_trans_destroy(trans);
+err_trans:
+ nft_use_dec_restore(&table->use);
+err_destroy_chain:
+ nf_tables_chain_destroy(chain);
- return n == m;
+ return err;
}
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
- u32 flags)
+ u32 flags, const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
+ struct nft_base_chain *basechain = NULL;
struct nft_table *table = ctx->table;
struct nft_chain *chain = ctx->chain;
- struct nft_base_chain *basechain;
- struct nft_stats *stats = NULL;
- struct nft_chain_hook hook;
+ struct nft_chain_hook hook = {};
+ struct nft_stats __percpu *stats = NULL;
+ struct nftables_pernet *nft_net;
+ struct nft_hook *h, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
+ bool unregister = false;
int err;
if (chain->flags ^ flags)
return -EOPNOTSUPP;
+ INIT_LIST_HEAD(&hook.list);
+
if (nla[NFTA_CHAIN_HOOK]) {
- if (!nft_is_base_chain(chain))
- return -EBUSY;
+ if (!nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
+ }
- err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- false);
+ basechain = nft_base_chain(chain);
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
+ ctx->family, flags, extack);
if (err < 0)
return err;
- basechain = nft_base_chain(chain);
if (basechain->type != hook.type) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
- if (ctx->family == NFPROTO_NETDEV) {
- if (!nft_hook_list_equal(&basechain->hook_list,
- &hook.list)) {
- nft_chain_release_hook(&hook);
- return -EBUSY;
+ if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ list_for_each_entry(ops, &h->ops_list, list) {
+ ops->pf = basechain->ops.pf;
+ ops->hooknum = basechain->ops.hooknum;
+ ops->priority = basechain->ops.priority;
+ ops->priv = basechain->ops.priv;
+ ops->hook = basechain->ops.hook;
+ }
+
+ if (nft_hook_list_find(&basechain->hook_list, h)) {
+ list_del(&h->list);
+ nft_netdev_hook_free(h);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWCHAIN ||
+ trans->table != ctx->table ||
+ !nft_trans_chain_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
+ nft_chain_release_hook(&hook);
+ return -EEXIST;
+ }
+ }
}
} else {
ops = &basechain->ops;
if (ops->hooknum != hook.num ||
ops->priority != hook.priority) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
}
- nft_chain_release_hook(&hook);
}
if (nla[NFTA_CHAIN_HANDLE] &&
@@ -2086,24 +2895,52 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
chain2 = nft_chain_lookup(ctx->net, table,
nla[NFTA_CHAIN_NAME], genmask);
- if (!IS_ERR(chain2))
- return -EEXIST;
+ if (!IS_ERR(chain2)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
+ err = -EEXIST;
+ goto err_hooks;
+ }
+ }
+
+ if (table->flags & __NFT_TABLE_F_UPDATE &&
+ !list_empty(&hook.list)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nft_is_base_chain(chain) &&
+ !list_empty(&hook.list)) {
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ err = nft_netdev_register_hooks(ctx->net, &hook.list);
+ if (err < 0)
+ goto err_hooks;
+
+ unregister = true;
+ }
}
if (nla[NFTA_CHAIN_COUNTERS]) {
- if (!nft_is_base_chain(chain))
- return -EOPNOTSUPP;
+ if (!nft_is_base_chain(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats))
- return PTR_ERR(stats);
+ if (IS_ERR_PCPU(stats)) {
+ err = PTR_ERR_PCPU(stats);
+ goto err_hooks;
+ }
}
err = -ENOMEM;
- trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
- sizeof(struct nft_trans_chain));
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN);
if (trans == NULL)
- goto err;
+ goto err_trans;
nft_trans_chain_stats(trans) = stats;
nft_trans_chain_update(trans) = true;
@@ -2115,56 +2952,98 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
if (nla[NFTA_CHAIN_HANDLE] &&
nla[NFTA_CHAIN_NAME]) {
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_trans *tmp;
char *name;
err = -ENOMEM;
- name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
- goto err;
+ goto err_trans;
err = -EEXIST;
- list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
+ list_for_each_entry(tmp, &nft_net->commit_list, list) {
if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
- tmp->ctx.table == table &&
+ tmp->table == table &&
nft_trans_chain_update(tmp) &&
nft_trans_chain_name(tmp) &&
strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
kfree(name);
- goto err;
+ goto err_trans;
}
}
nft_trans_chain_name(trans) = name;
}
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ nft_trans_basechain(trans) = basechain;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&hook.list, &nft_trans_chain_hooks(trans));
+ if (nla[NFTA_CHAIN_HOOK])
+ module_put(hook.type->owner);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
-err:
+
+err_trans:
free_percpu(stats);
kfree(trans);
+err_hooks:
+ if (nla[NFTA_CHAIN_HOOK]) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ if (unregister) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nf_unregister_net_hook(ctx->net, ops);
+ }
+ list_del(&h->list);
+ nft_netdev_hook_free_rcu(h);
+ }
+ module_put(hook.type->owner);
+ }
+
return err;
}
-static int nf_tables_newchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
+ const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans *trans;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type == NFT_MSG_NEWCHAIN &&
+ nft_trans_chain(trans)->table == table &&
+ id == nft_trans_chain_id(trans) &&
+ nft_active_genmask(nft_trans_chain(trans), genmask))
+ return nft_trans_chain(trans);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_chain *chain = NULL;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- struct nft_chain *chain;
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
u32 flags = 0;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -2181,7 +3060,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
return PTR_ERR(chain);
}
attr = nla[NFTA_CHAIN_HANDLE];
- } else {
+ } else if (nla[NFTA_CHAIN_NAME]) {
chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
@@ -2190,6 +3069,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
chain = NULL;
}
+ } else if (!nla[NFTA_CHAIN_ID]) {
+ return -EINVAL;
}
if (nla[NFTA_CHAIN_POLICY]) {
@@ -2220,31 +3101,89 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
else if (chain)
flags = chain->flags;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ if (flags & ~NFT_CHAIN_FLAGS)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain != NULL) {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- flags |= chain->flags & NFT_BASE_CHAIN;
- return nf_tables_updchain(&ctx, genmask, policy, flags);
+ flags |= chain->flags & NFT_CHAIN_BASE;
+ return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
+ extack);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, flags);
+ return nf_tables_addchain(&ctx, family, policy, flags, extack);
}
-static int nf_tables_delchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nft_delchain_hook(struct nft_ctx *ctx,
+ struct nft_base_chain *basechain,
+ struct netlink_ext_ack *extack)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ const struct nft_chain *chain = &basechain->chain;
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_chain_hook chain_hook = {};
+ struct nft_hook *this, *hook;
+ LIST_HEAD(chain_del_list);
+ struct nft_trans *trans;
+ int err;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook,
+ ctx->family, chain->flags, extack);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry(this, &chain_hook.list, list) {
+ hook = nft_hook_list_find(&basechain->hook_list, this);
+ if (!hook) {
+ err = -ENOENT;
+ goto err_chain_del_hook;
+ }
+ list_move(&hook->list, &chain_del_list);
+ }
+
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_chain_del_hook;
+ }
+
+ nft_trans_basechain(trans) = basechain;
+ nft_trans_chain_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
+ nft_chain_release_hook(&chain_hook);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_chain_del_hook:
+ list_splice(&chain_del_list, &basechain->hook_list);
+ nft_chain_release_hook(&chain_hook);
+
+ return err;
+}
+
+static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
@@ -2254,7 +3193,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
u32 use;
int err;
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -2269,16 +3209,36 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup(net, table, attr, genmask);
}
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (nft_chain_binding(chain))
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
+
+ if (nla[NFTA_CHAIN_HOOK]) {
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN ||
+ chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ if (nft_is_base_chain(chain)) {
+ struct nft_base_chain *basechain = nft_base_chain(chain);
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_delchain_hook(&ctx, basechain, extack);
+ }
+ }
+
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
-
use = chain->use;
list_for_each_entry(rule, &chain->rules, list) {
if (!nft_is_active_next(net, rule))
@@ -2307,13 +3267,16 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
/**
* nft_register_expr - register nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
* Registers the expr type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
*/
int nft_register_expr(struct nft_expr_type *type)
{
+ if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
+ return -ENOMEM;
+
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
list_add_tail_rcu(&type->list, &nf_tables_expressions);
@@ -2326,7 +3289,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr);
/**
* nft_unregister_expr - unregister nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
* Unregisters the expr typefor use with nf_tables.
*/
@@ -2343,7 +3306,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
{
const struct nft_expr_type *type, *candidate = NULL;
- list_for_each_entry(type, &nf_tables_expressions, list) {
+ list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
if (!nla_strcmp(nla, type->name)) {
if (!type->family && !candidate)
candidate = type;
@@ -2375,9 +3338,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
+ rcu_read_lock();
type = __nft_expr_type_get(family, nla);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -2401,7 +3368,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
};
static int nf_tables_fill_expr_info(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
goto nla_put_failure;
@@ -2411,7 +3378,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb,
NFTA_EXPR_DATA);
if (data == NULL)
goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
+ if (expr->ops->dump(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, data);
}
@@ -2423,14 +3390,14 @@ nla_put_failure:
};
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
struct nlattr *nest;
nest = nla_nest_start_noflag(skb, attr);
if (!nest)
goto nla_put_failure;
- if (nf_tables_fill_expr_info(skb, expr) < 0)
+ if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
return 0;
@@ -2441,6 +3408,7 @@ nla_put_failure:
struct nft_expr_info {
const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
};
@@ -2488,7 +3456,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
} else
ops = type->ops;
+ info->attr = nla;
info->ops = ops;
+
return 0;
err1:
@@ -2496,16 +3466,65 @@ err1:
return err;
}
+int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ const struct nft_expr_type *type;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
+ nft_expr_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME])
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ if (!type) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (!type->inner_ops) {
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ err = nla_parse_nested_deprecated(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA],
+ type->policy, NULL);
+ if (err < 0)
+ goto out_unlock;
+
+ info->attr = nla;
+ info->ops = type->inner_ops;
+
+ /* No module reference will be taken on type->owner.
+ * Presence of type->inner_ops implies that the expression
+ * is builtin, so it cannot go away.
+ */
+ rcu_read_unlock();
+ return 0;
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
static int nf_tables_newexpr(const struct nft_ctx *ctx,
- const struct nft_expr_info *info,
+ const struct nft_expr_info *expr_info,
struct nft_expr *expr)
{
- const struct nft_expr_ops *ops = info->ops;
+ const struct nft_expr_ops *ops = expr_info->ops;
int err;
expr->ops = ops;
if (ops->init) {
- err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
if (err < 0)
goto err1;
}
@@ -2529,49 +3548,52 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
const struct nlattr *nla)
{
- struct nft_expr_info info;
+ struct nft_expr_info expr_info;
struct nft_expr *expr;
struct module *owner;
int err;
- err = nf_tables_expr_parse(ctx, nla, &info);
+ err = nf_tables_expr_parse(ctx, nla, &expr_info);
if (err < 0)
- goto err1;
+ goto err_expr_parse;
+
+ err = -EOPNOTSUPP;
+ if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_expr_stateful;
err = -ENOMEM;
- expr = kzalloc(info.ops->size, GFP_KERNEL);
+ expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
if (expr == NULL)
- goto err2;
+ goto err_expr_stateful;
- err = nf_tables_newexpr(ctx, &info, expr);
+ err = nf_tables_newexpr(ctx, &expr_info, expr);
if (err < 0)
- goto err3;
+ goto err_expr_new;
return expr;
-err3:
+err_expr_new:
kfree(expr);
-err2:
- owner = info.ops->type->owner;
- if (info.ops->type->release_ops)
- info.ops->type->release_ops(info.ops);
+err_expr_stateful:
+ owner = expr_info.ops->type->owner;
+ if (expr_info.ops->type->release_ops)
+ expr_info.ops->type->release_ops(expr_info.ops);
module_put(owner);
-err1:
+err_expr_parse:
return ERR_PTR(err);
}
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
int err;
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
- }
+ if (WARN_ON_ONCE(!src->ops->clone))
+ return -EINVAL;
+
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src, gfp);
+ if (err < 0)
+ return err;
__module_get(src->ops->type->owner);
@@ -2588,13 +3610,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
* Rules
*/
-static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *__nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
u64 handle)
{
struct nft_rule *rule;
// FIXME: this sucks
- list_for_each_entry_rcu(rule, &chain->rules, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list,
+ lockdep_commit_lock_is_held(net)) {
if (handle == rule->handle)
return rule;
}
@@ -2602,13 +3626,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
return ERR_PTR(-ENOENT);
}
-static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
if (nla == NULL)
return ERR_PTR(-EINVAL);
- return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+ return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla)));
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2617,13 +3642,14 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
- [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
[NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
[NFTA_RULE_POSITION] = { .type = NLA_U64 },
[NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_RULE_ID] = { .type = NLA_U32 },
[NFTA_RULE_POSITION_ID] = { .type = NLA_U32 },
+ [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 },
};
static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
@@ -2631,24 +3657,19 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u32 flags, int family,
const struct nft_table *table,
const struct nft_chain *chain,
- const struct nft_rule *rule,
- const struct nft_rule *prule)
+ const struct nft_rule *rule, u64 handle,
+ bool reset)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
const struct nft_expr *expr, *next;
struct nlattr *list;
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
+ nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
@@ -2657,18 +3678,20 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
NFTA_RULE_PAD))
goto nla_put_failure;
- if (event != NFT_MSG_DELRULE && prule) {
- if (nla_put_be64(skb, NFTA_RULE_POSITION,
- cpu_to_be64(prule->handle),
+ if (event != NFT_MSG_DELRULE && handle) {
+ if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle),
NFTA_RULE_PAD))
goto nla_put_failure;
}
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_stats(chain, rule);
+
list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS);
if (list == NULL)
goto nla_put_failure;
nft_rule_for_each_expr(expr, next, rule) {
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, list);
@@ -2691,7 +3714,11 @@ nla_put_failure:
static void nf_tables_rule_notify(const struct nft_ctx *ctx,
const struct nft_rule *rule, int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ const struct nft_rule *prule;
struct sk_buff *skb;
+ u64 handle = 0;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -2702,24 +3729,48 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (event == NFT_MSG_NEWRULE &&
+ !list_is_first(&rule->list, &ctx->chain->rules) &&
+ !list_is_last(&rule->list, &ctx->chain->rules)) {
+ prule = list_prev_entry(rule, list);
+ handle = prule->handle;
+ }
+ if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE))
+ flags |= NLM_F_APPEND;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
- ctx->chain, rule, NULL);
+ event, flags, ctx->family, ctx->table,
+ ctx->chain, rule, handle, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+static void audit_log_rule_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+ table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
struct nft_rule_dump_ctx {
+ unsigned int s_idx;
char *table;
char *chain;
+ bool reset;
};
static int __nf_tables_dump_rules(struct sk_buff *skb,
@@ -2728,59 +3779,71 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
const struct nft_table *table,
const struct nft_chain *chain)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
const struct nft_rule *rule, *prule;
- unsigned int s_idx = cb->args[0];
+ unsigned int entries = 0;
+ int ret = 0;
+ u64 handle;
prule = NULL;
list_for_each_entry_rcu(rule, &chain->rules, list) {
if (!nft_is_active(net, rule))
goto cont_skip;
- if (*idx < s_idx)
+ if (*idx < ctx->s_idx)
goto cont;
- if (*idx > s_idx) {
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- }
+ if (prule)
+ handle = prule->handle;
+ else
+ handle = 0;
+
if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule, prule) < 0)
- return 1;
-
+ table, chain, rule, handle, ctx->reset) < 0) {
+ ret = 1;
+ break;
+ }
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
prule = rule;
cont_skip:
(*idx)++;
}
- return 0;
+
+ if (ctx->reset && entries)
+ audit_log_rule_reset(table, cb->seq, entries);
+
+ return ret;
}
static int nf_tables_dump_rules(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct nft_table *table;
const struct nft_chain *chain;
unsigned int idx = 0;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
- if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
+ if (ctx->table && strcmp(ctx->table, table->name) != 0)
continue;
- if (ctx && ctx->table && ctx->chain) {
+ if (ctx->table && ctx->chain) {
struct rhlist_head *list, *tmp;
list = rhltable_lookup(&table->chains_ht, ctx->chain,
@@ -2799,129 +3862,198 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
}
list_for_each_entry_rcu(chain, &table->chains, list) {
- if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
+ if (__nf_tables_dump_rules(skb, &idx,
+ cb, table, chain))
goto done;
}
- if (ctx && ctx->table)
+ if (ctx->table)
break;
}
done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+static int nf_tables_dumpreset_rules(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ /* Mutex is held is to prevent that two concurrent dump-and-reset calls
+ * do not underrun counters and quotas. The commit_mutex is used for
+ * the lack a better lock, this is not transaction path.
+ */
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_rules(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_rules_start(struct netlink_callback *cb)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_rule_dump_ctx *ctx = NULL;
- if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
- ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
- if (!ctx)
- return -ENOMEM;
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_RULE_TABLE]) {
- ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_ATOMIC);
- if (!ctx->table) {
- kfree(ctx);
- return -ENOMEM;
- }
- }
- if (nla[NFTA_RULE_CHAIN]) {
- ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_ATOMIC);
- if (!ctx->chain) {
- kfree(ctx->table);
- kfree(ctx);
- return -ENOMEM;
- }
+ if (nla[NFTA_RULE_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
+ }
+ if (nla[NFTA_RULE_CHAIN]) {
+ ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC);
+ if (!ctx->chain) {
+ kfree(ctx->table);
+ return -ENOMEM;
}
}
-
- cb->data = ctx;
return 0;
}
+static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
+{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_rules_start(cb);
+}
+
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
- struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
- if (ctx) {
- kfree(ctx->table);
- kfree(ctx->chain);
- kfree(ctx);
- }
+ kfree(ctx->table);
+ kfree(ctx->chain);
return 0;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
const struct nft_rule *rule;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start= nf_tables_dump_rules_start,
- .dump = nf_tables_dump_rules,
- .done = nf_tables_dump_rules_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
- }
-
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ return ERR_CAST(chain);
}
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
- return PTR_ERR(rule);
+ return ERR_CAST(rule);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
- family, table, chain, rule, NULL);
- if (err < 0)
- goto err;
+ err = nf_tables_fill_rule_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+ family, table, chain, rule, 0, reset);
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return skb2;
+}
-err:
- kfree_skb(skb2);
- return err;
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start= nf_tables_dump_rules_start,
+ .dump = nf_tables_dump_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ skb2 = nf_tables_getrule_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
-static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+static int nf_tables_getrule_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start= nf_tables_dumpreset_rules_start,
+ .dump = nf_tables_dumpreset_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getrule_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_RULE_TABLE]),
+ (char *)nla_data(nla[NFTA_RULE_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
+}
+
+void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr, *next;
@@ -2930,7 +4062,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
next = nft_expr_next(expr);
nf_tables_expr_destroy(ctx, expr);
expr = next;
@@ -2938,17 +4070,27 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
kfree(rule);
}
-static void nf_tables_rule_release(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+/* can only be used if rule is no longer visible to dumps */
+static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
nf_tables_rule_destroy(ctx, rule);
}
+/** nft_chain_validate - loop detection and hook validation
+ *
+ * @ctx: context containing call depth and base chain
+ * @chain: chain to validate
+ *
+ * Walk through the rules of the given chain and chase all jumps/gotos
+ * and set lookups until either the jump limit is hit or all reachable
+ * chains have been validated.
+ */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
struct nft_expr *expr, *last;
- const struct nft_data *data;
struct nft_rule *rule;
int err;
@@ -2956,6 +4098,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
return -EMLINK;
list_for_each_entry(rule, &chain->rules, list) {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
if (!nft_is_active_next(ctx->net, rule))
continue;
@@ -2963,7 +4108,10 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (!expr->ops->validate)
continue;
- err = expr->ops->validate(ctx, expr, &data);
+ /* This may call nft_chain_validate() recursively,
+ * callers that do so must increment ctx->level.
+ */
+ err = expr->ops->validate(ctx, expr);
if (err < 0)
return err;
}
@@ -2990,86 +4138,160 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
err = nft_chain_validate(&ctx, chain);
if (err < 0)
return err;
+
+ cond_resched();
}
return 0;
}
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+ const struct nft_data *data;
+ int err;
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ pctx->level++;
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ pctx->level--;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter dummy_iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ };
+ struct nft_set_elem_catchall *catchall;
+
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, dummy_iter.genmask))
+ continue;
+
+ ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+}
+
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla);
#define NFT_RULE_MAXEXPRS 128
-static int nf_tables_newrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_expr_info *info = NULL;
- int family = nfmsg->nfgen_family;
- struct nft_flow_rule *flow;
- struct nft_table *table;
- struct nft_chain *chain;
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ unsigned int size, i, n, ulen = 0, usize = 0;
+ u8 genmask = nft_genmask_next(info->net);
struct nft_rule *rule, *old_rule = NULL;
+ struct nft_expr_info *expr_info = NULL;
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_flow_rule *flow = NULL;
+ struct net *net = info->net;
struct nft_userdata *udata;
- struct nft_trans *trans = NULL;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_trans *trans;
+ u64 handle, pos_handle;
struct nft_expr *expr;
struct nft_ctx ctx;
struct nlattr *tmp;
- unsigned int size, i, n, ulen = 0, usize = 0;
int err, rem;
- u64 handle, pos_handle;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
}
- chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ if (nla[NFTA_RULE_CHAIN]) {
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+ genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
+ return PTR_ERR(chain);
+ }
+
+ } else if (nla[NFTA_RULE_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
+ genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
+ return PTR_ERR(chain);
+ }
+ } else {
+ return -EINVAL;
}
+ if (nft_chain_is_bound(chain))
+ return -EOPNOTSUPP;
+
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
- rule = __nft_rule_lookup(chain, handle);
+ rule = __nft_rule_lookup(net, chain, handle);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
return -EOPNOTSUPP;
} else {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
- nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
+ info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);
- if (chain->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_RULE_POSITION]) {
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nft_rule_lookup(chain, pos_handle);
+ old_rule = __nft_rule_lookup(net, chain, pos_handle);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
}
} else if (nla[NFTA_RULE_POSITION_ID]) {
- old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]);
+ old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
return PTR_ERR(old_rule);
@@ -3077,34 +4299,36 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) {
- info = kvmalloc_array(NFT_RULE_MAXEXPRS,
- sizeof(struct nft_expr_info),
- GFP_KERNEL);
- if (!info)
+ expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+ sizeof(struct nft_expr_info),
+ GFP_KERNEL);
+ if (!expr_info)
return -ENOMEM;
nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
- goto err1;
+ goto err_release_expr;
if (n == NFT_RULE_MAXEXPRS)
- goto err1;
- err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
- if (err < 0)
- goto err1;
- size += info[n].ops->size;
+ goto err_release_expr;
+ err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, tmp);
+ goto err_release_expr;
+ }
+ size += expr_info[n].ops->size;
n++;
}
}
/* Check for overflow of dlen field */
err = -EFBIG;
if (size >= 1 << 12)
- goto err1;
+ goto err_release_expr;
if (nla[NFTA_RULE_USERDATA]) {
ulen = nla_len(nla[NFTA_RULE_USERDATA]);
@@ -3113,9 +4337,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
err = -ENOMEM;
- rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
if (rule == NULL)
- goto err1;
+ goto err_release_expr;
nft_activate_next(net, rule);
@@ -3131,38 +4355,56 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
expr = nft_expr_first(rule);
for (i = 0; i < n; i++) {
- err = nf_tables_newexpr(&ctx, &info[i], expr);
- if (err < 0)
- goto err2;
+ err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, expr_info[i].attr);
+ goto err_release_rule;
+ }
- if (info[i].ops->validate)
- nft_validate_state_update(net, NFT_VALIDATE_NEED);
+ if (expr_info[i].ops->validate)
+ nft_validate_state_update(table, NFT_VALIDATE_NEED);
- info[i].ops = NULL;
+ expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(net, rule);
+ if (IS_ERR(flow)) {
+ err = PTR_ERR(flow);
+ goto err_release_rule;
+ }
+ }
+
+ if (!nft_use_inc(&chain->use)) {
+ err = -EMFILE;
+ goto err_release_rule;
+ }
+
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (nft_chain_binding(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_destroy_flow_rule;
+ }
+
+ err = nft_delrule(&ctx, old_rule);
+ if (err < 0)
+ goto err_destroy_flow_rule;
+
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- err = nft_delrule(&ctx, old_rule);
- if (err < 0) {
- nft_trans_destroy(trans);
- goto err2;
- }
-
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (!trans) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- if (nlh->nlmsg_flags & NLM_F_APPEND) {
+ if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
if (old_rule)
list_add_rcu(&rule->list, &old_rule->list);
else
@@ -3174,65 +4416,68 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_rcu(&rule->list, &chain->rules);
}
}
- kvfree(info);
- chain->use++;
-
- if (net->nft.validate_state == NFT_VALIDATE_DO)
- return nft_table_validate(net, table);
-
- if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
- flow = nft_flow_rule_create(net, rule);
- if (IS_ERR(flow))
- return PTR_ERR(flow);
+ kvfree(expr_info);
+ if (flow)
nft_trans_flow_rule(trans) = flow;
- }
+
+ if (table->validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
return 0;
-err2:
- nf_tables_rule_release(&ctx, rule);
-err1:
+
+err_destroy_flow_rule:
+ nft_use_dec_restore(&chain->use);
+ if (flow)
+ nft_flow_rule_destroy(flow);
+err_release_rule:
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
+ nf_tables_rule_destroy(&ctx, rule);
+err_release_expr:
for (i = 0; i < n; i++) {
- if (info[i].ops) {
- module_put(info[i].ops->type->owner);
- if (info[i].ops->type->release_ops)
- info[i].ops->type->release_ops(info[i].ops);
+ if (expr_info[i].ops) {
+ module_put(expr_info[i].ops->type->owner);
+ if (expr_info[i].ops->type->release_ops)
+ expr_info[i].ops->type->release_ops(expr_info[i].ops);
}
}
- kvfree(info);
+ kvfree(expr_info);
+
return err;
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
struct nft_trans *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- struct nft_rule *rule = nft_trans_rule(trans);
-
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
if (trans->msg_type == NFT_MSG_NEWRULE &&
+ nft_trans_rule_chain(trans) == chain &&
id == nft_trans_rule_id(trans))
- return rule;
+ return nft_trans_rule(trans);
}
return ERR_PTR(-ENOENT);
}
-static int nf_tables_delrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_table *table;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_chain *chain = NULL;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_rule *rule;
- int family = nfmsg->nfgen_family, err = 0;
struct nft_ctx ctx;
+ int err = 0;
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
@@ -3242,24 +4487,34 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
genmask);
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
+ if (nft_chain_binding(chain))
+ return -EOPNOTSUPP;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
+ if (PTR_ERR(rule) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
- rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+ rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
@@ -3273,6 +4528,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_active_next(net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
ctx.chain = chain;
err = nft_delrule_by_chain(&ctx);
@@ -3314,23 +4571,18 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
* given, in that case the amount of memory per element is used.
*/
static const struct nft_set_ops *
-nft_select_set_ops(const struct nft_ctx *ctx,
- const struct nlattr * const nla[],
- const struct nft_set_desc *desc,
- enum nft_set_policies policy)
+nft_select_set_ops(const struct nft_ctx *ctx, u32 flags,
+ const struct nft_set_desc *desc)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nft_set_ops *ops, *bops;
struct nft_set_estimate est, best;
const struct nft_set_type *type;
- u32 flags = 0;
int i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
- if (nla[NFTA_SET_FLAGS] != NULL)
- flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
-
bops = NULL;
best.size = ~0;
best.lookup = ~0;
@@ -3345,7 +4597,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
if (!ops->estimate(desc, flags, &est))
continue;
- switch (policy) {
+ switch (desc->policy) {
case NFT_SET_POL_PERFORMANCE:
if (est.lookup < best.lookup)
break;
@@ -3398,38 +4650,22 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
[NFTA_SET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
+ [NFTA_SET_TYPE] = { .type = NLA_REJECT },
+ [NFTA_SET_COUNT] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+ [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
- [NFTA_SET_DESC_CONCAT] = { .type = NLA_NESTED },
+ [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
-{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table = NULL;
-
- if (nla[NFTA_SET_TABLE] != NULL) {
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
- return PTR_ERR(table);
- }
- }
-
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
- return 0;
-}
-
-static struct nft_set *nft_set_lookup(const struct nft_table *table,
+static struct nft_set *nft_set_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
@@ -3437,7 +4673,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(set, &table->sets, list) {
+ list_for_each_entry_rcu(set, &table->sets, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, set->name) &&
nft_active_genmask(set, genmask))
return set;
@@ -3460,19 +4697,21 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
}
static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
- struct nft_trans *trans;
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans_set *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- if (trans->msg_type == NFT_MSG_NEWSET) {
- struct nft_set *set = nft_trans_set(trans);
+ /* its likely the id we need is at the tail, not at start */
+ list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) {
+ struct nft_set *set = trans->set;
- if (id == nft_trans_set_id(trans) &&
- nft_active_genmask(set, genmask))
- return set;
- }
+ if (id == trans->set_id &&
+ set->table == table &&
+ nft_active_genmask(set, genmask))
+ return set;
}
return ERR_PTR(-ENOENT);
}
@@ -3485,12 +4724,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
{
struct nft_set *set;
- set = nft_set_lookup(table, nla_set_name, genmask);
+ set = nft_set_lookup(net, table, nla_set_name, genmask);
if (IS_ERR(set)) {
if (!nla_set_id)
return set;
- set = nft_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
return set;
}
@@ -3509,6 +4748,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
+ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
+ return -EINVAL;
+
inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (inuse == NULL)
return -ENOMEM;
@@ -3516,7 +4758,7 @@ cont:
list_for_each_entry(i, &ctx->table->sets, list) {
int tmp;
- if (!nft_is_active_next(ctx->net, set))
+ if (!nft_is_active_next(ctx->net, i))
continue;
if (!sscanf(i->name, name, &tmp))
continue;
@@ -3535,7 +4777,7 @@ cont:
free_page((unsigned long)inuse);
}
- set->name = kasprintf(GFP_KERNEL, name, min + n);
+ set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
if (!set->name)
return -ENOMEM;
@@ -3551,7 +4793,7 @@ cont:
return 0;
}
-static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
u64 ms = be64_to_cpu(nla_get_be64(nla));
u64 max = (u64)(~((u64)0));
@@ -3561,11 +4803,11 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
return -ERANGE;
ms *= NSEC_PER_MSEC;
- *result = nsecs_to_jiffies64(ms);
+ *result = nsecs_to_jiffies64(ms) ? : !!ms;
return 0;
}
-static __be64 nf_jiffies64_to_msecs(u64 input)
+__be64 nf_jiffies64_to_msecs(u64 input)
{
return cpu_to_be64(jiffies64_to_msecs(input));
}
@@ -3597,26 +4839,53 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb,
return 0;
}
+static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
+{
+ if (ops->usize)
+ return ops->usize(size);
+
+ return size;
+}
+
+static noinline_for_stack int
+nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set)
+{
+ unsigned int nelems;
+ char str[40];
+ int ret;
+
+ ret = snprintf(str, sizeof(str), "%ps", set->ops);
+
+ /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped
+ * to userspace purely for informational/debug purposes.
+ */
+ DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str));
+
+ if (nla_put_string(skb, NFTA_SET_TYPE, str))
+ return -EMSGSIZE;
+
+ nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems));
+ return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems));
+}
+
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
- struct nfgenmsg *nfmsg;
- struct nlmsghdr *nlh;
+ u64 timeout = READ_ONCE(set->timeout);
+ u32 gc_int = READ_ONCE(set->gc_int);
u32 portid = ctx->portid;
+ struct nlmsghdr *nlh;
struct nlattr *nest;
u32 seq = ctx->seq;
+ int i;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, ctx->family, NFNETLINK_V0,
+ nft_base_seq_be16(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -3624,6 +4893,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
NFTA_SET_PAD))
goto nla_put_failure;
+
+ if (event == NFT_MSG_DELSET ||
+ event == NFT_MSG_DESTROYSET) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (set->flags != 0)
if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
goto nla_put_failure;
@@ -3642,13 +4918,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
goto nla_put_failure;
- if (set->timeout &&
+ if (timeout &&
nla_put_be64(skb, NFTA_SET_TIMEOUT,
- nf_jiffies64_to_msecs(set->timeout),
+ nf_jiffies64_to_msecs(timeout),
NFTA_SET_PAD))
goto nla_put_failure;
- if (set->gc_int &&
- nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int)))
+ if (gc_int &&
+ nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int)))
goto nla_put_failure;
if (set->policy != NFT_SET_POL_PERFORMANCE) {
@@ -3656,14 +4932,16 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
}
- if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
+ if (set->udata &&
+ nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
goto nla_put_failure;
nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
if (!nest)
goto nla_put_failure;
if (set->size &&
- nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE,
+ htonl(nft_set_userspace_size(set->ops, set->size))))
goto nla_put_failure;
if (set->field_count > 1 &&
@@ -3672,12 +4950,26 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_nest_end(skb, nest);
- if (set->expr) {
+ if (nf_tables_fill_set_info(skb, set))
+ goto nla_put_failure;
+
+ if (set->num_exprs == 1) {
nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
- if (nf_tables_fill_expr_info(skb, set->expr) < 0)
+ if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
+ } else if (set->num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ set->exprs[i], false) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
}
nlmsg_end(skb, nlh);
@@ -3692,8 +4984,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
const struct nft_set *set, int event,
gfp_t gfp_flags)
{
- struct sk_buff *skb;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
u32 portid = ctx->portid;
+ struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -3704,14 +4998,16 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
- err = nf_tables_fill_set(skb, ctx, set, event, 0);
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
+ err = nf_tables_fill_set(skb, ctx, set, event, flags);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
- gfp_flags);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -3724,14 +5020,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
struct net *net = sock_net(skb->sk);
struct nft_ctx *ctx = cb->data, ctx_set;
+ struct nftables_pernet *nft_net;
if (cb->args[1])
return skb->len;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
ctx->family != table->family)
continue;
@@ -3795,25 +5093,31 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_table *table = NULL;
+ struct net *net = info->net;
const struct nft_set *set;
- struct nft_ctx ctx;
struct sk_buff *skb2;
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_ctx ctx;
int err;
- /* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_TABLE]) {
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
+ }
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_sets_start,
.dump = nf_tables_dump_sets,
@@ -3822,18 +5126,20 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
/* Only accept unspec with dump */
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
@@ -3841,19 +5147,15 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
if (err < 0)
- goto err;
+ goto err_fill_set_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_set_info:
kfree_skb(skb2);
return err;
}
-static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
- [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
-};
-
static int nft_set_desc_concat_parse(const struct nlattr *attr,
struct nft_set_desc *desc)
{
@@ -3861,6 +5163,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
u32 len;
int err;
+ if (desc->field_count >= ARRAY_SIZE(desc->field_len))
+ return -E2BIG;
+
err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
nft_concat_policy, NULL);
if (err < 0)
@@ -3870,9 +5175,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
return -EINVAL;
len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
-
- if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
- return -E2BIG;
+ if (!len || len > U8_MAX)
+ return -EINVAL;
desc->field_len[desc->field_count++] = len;
@@ -3882,8 +5186,9 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
static int nft_set_desc_concat(struct nft_set_desc *desc,
const struct nlattr *nla)
{
+ u32 len = 0, num_regs;
struct nlattr *attr;
- int rem, err;
+ int rem, err, i;
nla_for_each_nested(attr, nla, rem) {
if (nla_type(attr) != NFTA_LIST_ELEM)
@@ -3894,6 +5199,16 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
return err;
}
+ for (i = 0; i < desc->field_count; i++)
+ len += round_up(desc->field_len[i], sizeof(u32));
+
+ if (len != desc->klen)
+ return -EINVAL;
+
+ num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
+ if (num_regs > NFT_REG32_COUNT)
+ return -E2BIG;
+
return 0;
}
@@ -3916,28 +5231,116 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
return err;
}
-static int nf_tables_newset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr * const *nla,
+ struct nft_expr **exprs, int *num_exprs,
+ u32 flags)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nft_expr *expr;
+ int err, i;
+
+ if (nla[NFTA_SET_EXPR]) {
+ expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ exprs[0] = expr;
+ (*num_exprs)++;
+ } else if (nla[NFTA_SET_EXPRESSIONS]) {
+ struct nlattr *tmp;
+ int left;
+
+ if (!(flags & NFT_SET_EXPR)) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_set_expr_alloc;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ exprs[i++] = expr;
+ (*num_exprs)++;
+ }
+ }
+
+ return 0;
+
+err_set_expr_alloc:
+ for (i = 0; i < *num_exprs; i++)
+ nft_expr_destroy(ctx, exprs[i]);
+
+ return err;
+}
+
+static bool nft_set_is_same(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ struct nft_expr *exprs[], u32 num_exprs, u32 flags)
+{
+ int i;
+
+ if (set->ktype != desc->ktype ||
+ set->dtype != desc->dtype ||
+ set->flags != flags ||
+ set->klen != desc->klen ||
+ set->dlen != desc->dlen ||
+ set->field_count != desc->field_count ||
+ set->num_exprs != num_exprs)
+ return false;
+
+ for (i = 0; i < desc->field_count; i++) {
+ if (set->field_len[i] != desc->field_len[i])
+ return false;
+ }
+
+ for (i = 0; i < num_exprs; i++) {
+ if (set->exprs[i]->ops != exprs[i]->ops)
+ return false;
+ }
+
+ return true;
+}
+
+static u32 nft_set_kernel_size(const struct nft_set_ops *ops,
+ const struct nft_set_desc *desc)
+{
+ if (ops->ksize)
+ return ops->ksize(desc->size);
+
+ return desc->size;
+}
+
+static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_set_ops *ops;
- struct nft_expr *expr = NULL;
+ struct net *net = info->net;
+ struct nft_set_desc desc;
struct nft_table *table;
+ unsigned char *udata;
struct nft_set *set;
struct nft_ctx ctx;
+ size_t alloc_size;
+ int num_exprs = 0;
char *name;
- u64 size;
- u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int, objtype;
- struct nft_set_desc desc;
- unsigned char *udata;
+ int err, i;
u16 udlen;
- int err;
- int i;
+ u32 flags;
+ u64 size;
if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_NAME] == NULL ||
@@ -3947,10 +5350,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
memset(&desc, 0, sizeof(desc));
- ktype = NFT_DATA_VALUE;
+ desc.ktype = NFT_DATA_VALUE;
if (nla[NFTA_SET_KEY_TYPE] != NULL) {
- ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
- if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+ desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+ if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
return -EINVAL;
}
@@ -3964,7 +5367,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
NFT_SET_MAP | NFT_SET_EVAL |
- NFT_SET_OBJECT | NFT_SET_CONCAT))
+ NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
return -EOPNOTSUPP;
/* Only one of these operations is supported */
if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
@@ -3973,19 +5376,25 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
(NFT_SET_EVAL | NFT_SET_OBJECT))
return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) ==
+ (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) ==
+ (NFT_SET_CONSTANT | NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
}
- dtype = 0;
+ desc.dtype = 0;
if (nla[NFTA_SET_DATA_TYPE] != NULL) {
if (!(flags & NFT_SET_MAP))
return -EINVAL;
- dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
- if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
- dtype != NFT_DATA_VERDICT)
+ desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+ if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+ desc.dtype != NFT_DATA_VERDICT)
return -EINVAL;
- if (dtype != NFT_DATA_VERDICT) {
+ if (desc.dtype != NFT_DATA_VERDICT) {
if (nla[NFTA_SET_DATA_LEN] == NULL)
return -EINVAL;
desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
@@ -4000,76 +5409,128 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (!(flags & NFT_SET_OBJECT))
return -EINVAL;
- objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
- if (objtype == NFT_OBJECT_UNSPEC ||
- objtype > NFT_OBJECT_MAX)
+ desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+ if (desc.objtype == NFT_OBJECT_UNSPEC ||
+ desc.objtype > NFT_OBJECT_MAX)
return -EOPNOTSUPP;
} else if (flags & NFT_SET_OBJECT)
return -EINVAL;
else
- objtype = NFT_OBJECT_UNSPEC;
+ desc.objtype = NFT_OBJECT_UNSPEC;
- timeout = 0;
+ desc.timeout = 0;
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout);
if (err)
return err;
}
- gc_int = 0;
+ desc.gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
+
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
+ desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
}
- policy = NFT_SET_POL_PERFORMANCE;
- if (nla[NFTA_SET_POLICY] != NULL)
- policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ desc.policy = NFT_SET_POL_PERFORMANCE;
+ if (nla[NFTA_SET_POLICY] != NULL) {
+ desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ switch (desc.policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ case NFT_SET_POL_MEMORY:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
if (nla[NFTA_SET_DESC] != NULL) {
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
+
+ if (desc.field_count > 1) {
+ if (!(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
+ }
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
}
- if (nla[NFTA_SET_EXPR])
+ if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
desc.expr = true;
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
if (PTR_ERR(set) != -ENOENT) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {};
+
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return 0;
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
+ err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
+ if (err < 0)
+ return err;
+
+ if (desc.size)
+ desc.size = nft_set_kernel_size(set->ops, &desc);
+
+ err = 0;
+ if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
+ err = -EEXIST;
+ }
+
+ for (i = 0; i < num_exprs; i++)
+ nft_expr_destroy(&ctx, exprs[i]);
+
+ if (err < 0)
+ return err;
+
+ return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
- ops = nft_select_set_ops(&ctx, nla, &desc, policy);
+ ops = nft_select_set_ops(&ctx, flags, &desc);
if (IS_ERR(ops))
return PTR_ERR(ops);
+ if (desc.size)
+ desc.size = nft_set_kernel_size(ops, &desc);
+
udlen = 0;
if (nla[NFTA_SET_USERDATA])
udlen = nla_len(nla[NFTA_SET_USERDATA]);
@@ -4077,12 +5538,20 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
size = 0;
if (ops->privsize != NULL)
size = ops->privsize(nla, &desc);
-
- set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
- if (!set)
+ alloc_size = sizeof(*set) + size + udlen;
+ if (alloc_size < size || alloc_size > INT_MAX)
return -ENOMEM;
- name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!set) {
+ err = -ENOMEM;
+ goto err_alloc;
+ }
+
+ name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
err = -ENOMEM;
goto err_set_name;
@@ -4091,15 +5560,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = nf_tables_set_alloc_name(&ctx, set, name);
kfree(name);
if (err < 0)
- goto err_set_alloc_name;
-
- if (nla[NFTA_SET_EXPR]) {
- expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]);
- if (IS_ERR(expr)) {
- err = PTR_ERR(expr);
- goto err_set_alloc_name;
- }
- }
+ goto err_set_name;
udata = NULL;
if (udlen) {
@@ -4108,23 +5569,23 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
+ INIT_LIST_HEAD(&set->catchall_list);
+ refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
- set->ops = ops;
- set->ktype = ktype;
- set->klen = desc.klen;
- set->dtype = dtype;
- set->objtype = objtype;
- set->dlen = desc.dlen;
- set->expr = expr;
+ set->ops = ops;
+ set->ktype = desc.ktype;
+ set->klen = desc.klen;
+ set->dtype = desc.dtype;
+ set->objtype = desc.objtype;
+ set->dlen = desc.dlen;
set->flags = flags;
- set->size = desc.size;
- set->policy = policy;
- set->udlen = udlen;
- set->udata = udata;
- set->timeout = timeout;
- set->gc_int = gc_int;
- set->handle = nf_tables_alloc_handle(table);
+ set->size = desc.size;
+ set->policy = desc.policy;
+ set->udlen = udlen;
+ set->udata = udata;
+ set->timeout = desc.timeout;
+ set->gc_int = desc.gc_int;
set->field_count = desc.field_count;
for (i = 0; i < desc.field_count; i++)
@@ -4134,88 +5595,133 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (err < 0)
goto err_set_init;
+ err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags);
+ if (err < 0)
+ goto err_set_destroy;
+
+ set->num_exprs = num_exprs;
+ set->handle = nf_tables_alloc_handle(table);
+ INIT_LIST_HEAD(&set->pending_update);
+
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err_set_trans;
+ goto err_set_expr_alloc;
list_add_tail_rcu(&set->list, &table->sets);
- table->use++;
+
return 0;
-err_set_trans:
- ops->destroy(set);
+err_set_expr_alloc:
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(&ctx, set->exprs[i]);
+err_set_destroy:
+ ops->destroy(&ctx, set);
err_set_init:
- if (expr)
- nft_expr_destroy(&ctx, expr);
-err_set_alloc_name:
kfree(set->name);
err_set_name:
kvfree(set);
+err_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
+static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ struct nft_set_elem_catchall *next, *catchall;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ list_del_rcu(&catchall->list);
+ nf_tables_set_elem_destroy(ctx, set, catchall->elem);
+ kfree_rcu(catchall, rcu);
+ }
+}
+
+static void nft_set_put(struct nft_set *set)
+{
+ if (refcount_dec_and_test(&set->refs)) {
+ kfree(set->name);
+ kvfree(set);
+ }
+}
+
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
+ int i;
+
if (WARN_ON(set->use > 0))
return;
- if (set->expr)
- nft_expr_destroy(ctx, set->expr);
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(ctx, set->exprs[i]);
- set->ops->destroy(set);
- kfree(set->name);
- kvfree(set);
+ set->ops->destroy(ctx, set);
+ nft_set_catchall_destroy(ctx, set);
+ nft_set_put(set);
}
-static int nf_tables_delset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- int err;
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
- if (nla[NFTA_SET_TABLE] == NULL)
- return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
if (nla[NFTA_SET_HANDLE]) {
attr = nla[NFTA_SET_HANDLE];
- set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+ set = nft_set_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_SET_NAME];
- set = nft_set_lookup(ctx.table, attr, genmask);
+ set = nft_set_lookup(net, table, attr, genmask);
}
if (IS_ERR(set)) {
+ if (PTR_ERR(set) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(set);
}
if (set->use ||
- (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+ (info->nlh->nlmsg_flags & NLM_F_NONREC &&
+ atomic_read(&set->nelems) > 0)) {
NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
}
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nft_delset(&ctx, set);
}
-static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len);
+
+static int nft_setelem_data_validate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
enum nft_registers dreg;
dreg = nft_type_to_reg(set->dtype);
@@ -4225,14 +5731,50 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
set->dlen);
}
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ return nft_setelem_data_validate(ctx, set, elem_priv);
+}
+
+static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ ret = nft_setelem_data_validate(ctx, set, catchall->elem);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
{
struct nft_set_binding *i;
- struct nft_set_iter iter;
-
- if (set->use == UINT_MAX)
- return -EOVERFLOW;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nf_tables_bind_check_setelem,
+ };
if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
return -EBUSY;
@@ -4247,21 +5789,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
goto bind;
}
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_bind_check_setelem;
-
set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_bind_check(ctx, set);
+
if (iter.err < 0)
return iter.err;
}
bind:
+ if (!nft_use_inc(&set->use))
+ return -EMFILE;
+
binding->chain = ctx->chain;
list_add_tail_rcu(&binding->list, &set->bindings);
nft_set_trans_bind(ctx, set);
- set->use++;
return 0;
}
@@ -4274,24 +5815,112 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) {
list_del_rcu(&set->list);
+ set->dead = 1;
if (event)
nf_tables_set_notify(ctx, set, NFT_MSG_DELSET,
GFP_KERNEL);
}
}
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv);
+
+static int nft_mapelem_activate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ /* called from abort path, reverse check to undo changes. */
+ if (nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ nft_clear(ctx->net, ext);
+ nft_setelem_data_activate(ctx->net, set, elem_priv);
+
+ return 0;
+}
+
+static void nft_map_catchall_activate(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ nft_clear(ctx->net, ext);
+ nft_setelem_data_activate(ctx->net, set, catchall->elem);
+ break;
+ }
+}
+
+static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_activate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_activate(ctx, set);
+}
+
+void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ if (nft_set_is_anonymous(set)) {
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(ctx, set);
+
+ nft_clear(ctx->net, set);
+ }
+
+ nft_use_inc_restore(&set->use);
+}
+EXPORT_SYMBOL_GPL(nf_tables_activate_set);
+
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding,
enum nft_trans_phase phase)
{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nft_set_trans_unbind(ctx, set);
+ if (nft_set_is_anonymous(set))
+ nft_deactivate_next(ctx->net, set);
+ else
+ list_del_rcu(&binding->list);
+
+ nft_use_dec(&set->use);
+ break;
case NFT_TRANS_PREPARE:
- set->use--;
+ if (nft_set_is_anonymous(set)) {
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_deactivate_next(ctx->net, set);
+ }
+ nft_use_dec(&set->use);
return;
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
- set->use--;
- /* fall through */
+ if (nft_set_is_anonymous(set) &&
+ set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_use_dec(&set->use);
+ fallthrough;
default:
nf_tables_unbind_set(ctx, set, binding,
phase == NFT_TRANS_COMMIT);
@@ -4313,8 +5942,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_DATA] = {
.align = __alignof__(u32),
},
- [NFT_SET_EXT_EXPR] = {
- .align = __alignof__(struct nft_expr),
+ [NFT_SET_EXT_EXPRESSIONS] = {
+ .align = __alignof__(struct nft_set_elem_expr),
},
[NFT_SET_EXT_OBJREF] = {
.len = sizeof(struct nft_object *),
@@ -4325,12 +5954,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u8),
},
[NFT_SET_EXT_TIMEOUT] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
- },
- [NFT_SET_EXT_EXPIRATION] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
+ .len = sizeof(struct nft_timeout),
+ .align = __alignof__(struct nft_timeout),
},
[NFT_SET_EXT_USERDATA] = {
.len = sizeof(struct nft_userdata),
@@ -4357,6 +5982,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -4364,37 +5990,54 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
+static int nft_set_elem_expr_dump(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_ext *ext,
+ bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table;
+ struct nft_set_elem_expr *elem_expr;
+ u32 size, num_exprs = 0;
+ struct nft_expr *expr;
+ struct nlattr *nest;
- table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
- return PTR_ERR(table);
- }
+ elem_expr = nft_set_ext_expr(ext);
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ num_exprs++;
+
+ if (num_exprs == 1) {
+ expr = nft_setelem_expr_at(elem_expr, 0);
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0)
+ return -1;
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
+ return 0;
+ } else if (num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ expr = nft_setelem_expr_at(elem_expr, size);
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
return 0;
+
+nla_put_failure:
+ return -1;
}
static int nf_tables_fill_setelem(struct sk_buff *skb,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -4402,7 +6045,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nest == NULL)
goto nla_put_failure;
- if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
NFT_DATA_VALUE, set->klen) < 0)
goto nla_put_failure;
@@ -4413,12 +6057,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
- set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
- set->dlen) < 0)
+ nft_set_datatype(set), set->dlen) < 0)
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) &&
- nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_set_elem_expr_dump(skb, set, ext, reset))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
@@ -4431,25 +6074,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
htonl(*nft_set_ext_flags(ext))))
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
- nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout);
+ u64 set_timeout = READ_ONCE(set->timeout);
+ __be64 msecs = 0;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- u64 expires, now = get_jiffies_64();
+ if (set_timeout != timeout) {
+ msecs = nf_jiffies64_to_msecs(timeout);
+ if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs,
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
- expires = *nft_set_ext_expiration(ext);
- if (time_before64(now, expires))
- expires -= now;
- else
- expires = 0;
+ if (timeout > 0) {
+ u64 expires, now = get_jiffies_64();
- if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- nf_jiffies64_to_msecs(expires),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration);
+ if (time_before64(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+
+ if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
+ nf_jiffies64_to_msecs(expires),
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
}
if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
@@ -4473,40 +6123,97 @@ struct nft_set_dump_args {
const struct netlink_callback *cb;
struct nft_set_iter iter;
struct sk_buff *skb;
+ bool reset;
};
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_set_dump_args *args;
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext))
+ return 0;
+
args = container_of(iter, struct nft_set_dump_args, iter);
- return nf_tables_fill_setelem(args->skb, set, elem);
+ return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset);
+}
+
+static void audit_log_nft_set_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
+ kfree(buf);
}
struct nft_set_dump_ctx {
const struct nft_set *set;
struct nft_ctx ctx;
+ bool reset;
};
+static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
+ const struct nft_set *set, bool reset,
+ unsigned int base_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) ||
+ nft_set_elem_expired(ext))
+ continue;
+
+ ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset);
+ if (reset && !ret)
+ audit_log_nft_set_reset(set->table, base_seq, 1);
+ break;
+ }
+
+ return ret;
+}
+
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
struct net *net = sock_net(skb->sk);
+ struct nftables_pernet *nft_net;
struct nft_table *table;
struct nft_set *set;
- struct nft_set_dump_args args;
+ struct nft_set_dump_args args = {
+ .cb = cb,
+ .skb = skb,
+ .reset = dump_ctx->reset,
+ .iter = {
+ .genmask = nft_genmask_cur(net),
+ .type = NFT_ITER_READ,
+ .skip = cb->args[0],
+ .fn = nf_tables_dump_setelem,
+ },
+ };
bool set_found = false;
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
u32 portid, seq;
int event;
rcu_read_lock();
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
+
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
dump_ctx->ctx.family != table->family)
continue;
@@ -4532,16 +6239,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- NLM_F_MULTI);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
+ table->family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = table->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
@@ -4551,19 +6253,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.genmask = nft_genmask_cur(net);
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&dump_ctx->ctx, set, &args.iter);
- rcu_read_unlock();
+ if (!args.iter.err && args.iter.count == cb->args[0])
+ args.iter.err = nft_set_catchall_dump(net, skb, set,
+ dump_ctx->reset, cb->seq);
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
+ rcu_read_unlock();
+
if (args.iter.err && args.iter.err != -EMSGSIZE)
return args.iter.err;
if (args.iter.count == cb->args[0])
@@ -4577,6 +6276,26 @@ nla_put_failure:
return -ENOSPC;
}
+static int nf_tables_dumpreset_set(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
+ int ret, skip = cb->args[0];
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ ret = nf_tables_dump_set(skb, cb);
+
+ if (cb->args[0] > skip)
+ audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
+ cb->args[0] - skip);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_set_start(struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
@@ -4596,24 +6315,19 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
const struct nft_ctx *ctx, u32 seq,
u32 portid, int event, u16 flags,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
int err;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+ NFNETLINK_V0, nft_base_seq_be16(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -4623,7 +6337,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
if (nest == NULL)
goto nla_put_failure;
- err = nf_tables_fill_setelem(skb, set, elem);
+ err = nf_tables_fill_setelem(skb, set, elem_priv, reset);
if (err < 0)
goto nla_put_failure;
@@ -4644,31 +6358,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
return 0;
*flags = ntohl(nla_get_be32(attr));
- if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
+ if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EOPNOTSUPP;
if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
+ if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) ==
+ (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
return 0;
}
-static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
+static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set,
struct nft_data *key, struct nlattr *attr)
{
- struct nft_data_desc desc;
- int err;
-
- err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
- if (err < 0)
- return err;
-
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
- nft_data_release(key, desc.type);
- return -EINVAL;
- }
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = NFT_DATA_VALUE_MAXLEN,
+ .len = set->klen,
+ };
- return 0;
+ return nft_data_init(ctx, key, &desc, attr);
}
static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
@@ -4676,28 +6387,68 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data *data,
struct nlattr *attr)
{
- int err;
+ u32 dtype;
- err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr);
- if (err < 0)
- return err;
+ if (set->dtype == NFT_DATA_VERDICT)
+ dtype = NFT_DATA_VERDICT;
+ else
+ dtype = NFT_DATA_VALUE;
- if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) {
- nft_data_release(data, desc->type);
- return -EINVAL;
+ desc->type = dtype;
+ desc->size = NFT_DATA_VALUE_MAXLEN;
+ desc->len = set->dlen;
+ desc->flags = NFT_DATA_DESC_SETELEM;
+
+ return nft_data_init(ctx, data, desc, attr);
+}
+
+static void *nft_setelem_catchall_get(const struct net *net,
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+ void *priv = NULL;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) ||
+ nft_set_elem_expired(ext))
+ continue;
+
+ priv = catchall->elem;
+ break;
+ }
+
+ return priv;
+}
+
+static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set,
+ struct nft_set_elem *elem, u32 flags)
+{
+ void *priv;
+
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+ priv = set->ops->get(ctx->net, set, elem, flags);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+ } else {
+ priv = nft_setelem_catchall_get(ctx->net, set);
+ if (!priv)
+ return -ENOENT;
}
+ elem->priv = priv;
return 0;
}
-static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr)
+static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, bool reset)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_elem elem;
struct sk_buff *skb;
uint32_t flags = 0;
- void *priv;
int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -4705,17 +6456,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (!nla[NFTA_SET_ELEM_KEY])
- return -EINVAL;
-
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -4724,92 +6477,173 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
- priv = set->ops->get(ctx->net, set, &elem, flags);
- if (IS_ERR(priv))
- return PTR_ERR(priv);
-
- elem.priv = priv;
+ err = nft_setelem_get(ctx, set, &elem, flags);
+ if (err < 0)
+ return err;
err = -ENOMEM;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb == NULL)
- goto err1;
+ return err;
err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
- NFT_MSG_NEWSETELEM, 0, set, &elem);
+ NFT_MSG_NEWSETELEM, 0, set, elem.priv,
+ reset);
if (err < 0)
- goto err2;
+ goto err_fill_setelem;
- err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT);
- /* This avoids a loop in nfnetlink. */
- if (err < 0)
- goto err1;
+ return nfnetlink_unicast(skb, ctx->net, ctx->portid);
- return 0;
-err2:
+err_fill_setelem:
kfree_skb(skb);
-err1:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return err;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx,
+ const struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[],
+ bool reset)
{
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_set *set;
- struct nlattr *attr;
- struct nft_ctx ctx;
- int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
+
+ nft_ctx_init(&dump_ctx->ctx, net, skb,
+ info->nlh, family, table, NULL, nla);
+ dump_ctx->set = set;
+ dump_ctx->reset = reset;
+ return 0;
+}
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+/* called with rcu_read_lock held */
+static int nf_tables_getsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ struct nlattr *attr;
+ int rem, err = 0;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_set_start,
.dump = nf_tables_dump_set,
.done = nf_tables_dump_set_done,
.module = THIS_MODULE,
};
- struct nft_set_dump_ctx dump_ctx = {
- .set = set,
- .ctx = ctx,
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ c.data = &dump_ctx;
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+ return -EINVAL;
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int nf_tables_getsetelem_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ int rem, err = 0, nelems = 0;
+ struct nlattr *attr;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_set_start,
+ .dump = nf_tables_dumpreset_set,
+ .done = nf_tables_dump_set_done,
+ .module = THIS_MODULE,
};
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ return err;
+
c.data = &dump_ctx;
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
return -EINVAL;
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ rcu_read_lock();
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ goto out_unlock;
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_get_set_elem(&ctx, set, attr);
- if (err < 0)
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
break;
+ }
+ nelems++;
}
+ audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);
+
+out_unlock:
+ rcu_read_unlock();
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
return err;
}
static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
const struct nft_set *set,
- const struct nft_set_elem *elem,
- int event, u16 flags)
+ const struct nft_elem_priv *elem_priv,
+ int event)
{
+ struct nftables_pernet *nft_net;
struct net *net = ctx->net;
u32 portid = ctx->portid;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -4819,31 +6653,38 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
- set, elem);
+ set, elem_priv, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
- GFP_KERNEL);
+ nft_net = nft_pernet(net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
-static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx,
int msg_type,
struct nft_set *set)
{
+ struct nft_trans_elem *te;
struct nft_trans *trans;
- trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+ trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1));
if (trans == NULL)
return NULL;
- nft_trans_elem_set(trans) = set;
+ te = nft_trans_container_elem(trans);
+ te->nelems = 1;
+ te->set = set;
+
return trans;
}
@@ -4859,9 +6700,6 @@ struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
return expr;
err = -EOPNOTSUPP;
- if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err_set_elem_expr;
-
if (expr->ops->type->flags & NFT_EXPR_GC) {
if (set->flags & NFT_SET_TIMEOUT)
goto err_set_elem_expr;
@@ -4877,39 +6715,77 @@ err_set_elem_expr:
return ERR_PTR(err);
}
-void *nft_set_elem_init(const struct nft_set *set,
- const struct nft_set_ext_tmpl *tmpl,
- const u32 *key, const u32 *key_end,
- const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
+static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
+{
+ len += nft_set_ext_types[id].len;
+ if (len > tmpl->ext_len[id] ||
+ len > U8_MAX)
+ return -1;
+
+ return 0;
+}
+
+static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
+ void *to, const void *from, u32 len)
+{
+ if (nft_set_ext_check(tmpl, id, len) < 0)
+ return -1;
+
+ memcpy(to, from, len);
+
+ return 0;
+}
+
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+ const struct nft_set_ext_tmpl *tmpl,
+ const u32 *key, const u32 *key_end,
+ const u32 *data,
+ u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
if (elem == NULL)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);
- memcpy(nft_set_ext_key(ext), key, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
- memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- memcpy(nft_set_ext_data(ext), data, set->dlen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
+ nft_set_ext_key(ext), key, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
+ nft_set_ext_key_end(ext), key_end, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
+ nft_set_ext_data(ext), data, set->dlen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ nft_set_ext_timeout(ext)->timeout = timeout;
+
if (expiration == 0)
- *nft_set_ext_expiration(ext) += timeout;
+ expiration = timeout;
+
+ nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration;
}
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
- *nft_set_ext_timeout(ext) = timeout;
return elem;
+
+err_ext_check:
+ kfree(elem);
+
+ return ERR_PTR(-EINVAL);
}
-static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
- struct nft_expr *expr)
+static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
{
if (expr->ops->destroy_clone) {
expr->ops->destroy_clone(ctx, expr);
@@ -4919,77 +6795,459 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
}
}
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_set_elem_expr *elem_expr)
+{
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ __nft_set_elem_expr_destroy(ctx, expr);
+}
+
+/* Drop references and destroy. Called from gc, dynset and abort path. */
+static void __nft_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
+ bool destroy_expr)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+
+ kfree(elem_priv);
+}
+
+/* Drop references and destroy. Called from gc and dynset. */
+void nft_set_elem_destroy(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
bool destroy_expr)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
struct nft_ctx ctx = {
.net = read_pnet(&set->net),
.family = set->table->family,
};
- nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
- kfree(elem);
+ __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
-/* Only called from commit path, nft_set_elem_deactivate() already deals with
- * the refcounting from the preparation phase.
+/* Drop references and destroy. Called from abort path. */
+static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ /* skip update request, see nft_trans_elems_new_abort() */
+ if (!te->elems[i].priv)
+ continue;
+
+ __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true);
+ }
+}
+
+/* Destroy element. References have been already dropped in the preparation
+ * path via nft_setelem_data_deactivate().
*/
-static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
- const struct nft_set *set, void *elem)
+void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
- kfree(elem);
+ kfree(elem_priv);
+}
+
+static void nft_trans_elems_destroy(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++)
+ nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv);
+}
+
+int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_expr *expr_array[])
+{
+ struct nft_expr *expr;
+ int err, i, k;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
+ if (!expr)
+ goto err_expr;
+
+ err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
+ if (err < 0) {
+ kfree(expr);
+ goto err_expr;
+ }
+ expr_array[i] = expr;
+ }
+
+ return 0;
+
+err_expr:
+ for (k = i - 1; k >= 0; k--)
+ nft_expr_destroy(ctx, expr_array[k]);
+
+ return -ENOMEM;
+}
+
+static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
+ const struct nft_set_ext_tmpl *tmpl,
+ const struct nft_set_ext *ext,
+ struct nft_expr *expr_array[],
+ u32 num_exprs)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ u32 len = sizeof(struct nft_set_elem_expr);
+ struct nft_expr *expr;
+ int i, err;
+
+ if (num_exprs == 0)
+ return 0;
+
+ for (i = 0; i < num_exprs; i++)
+ len += expr_array[i]->ops->size;
+
+ if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
+ return -EINVAL;
+
+ for (i = 0; i < num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
+ if (err < 0)
+ goto err_elem_expr_setup;
+
+ elem_expr->size += expr_array[i]->ops->size;
+ nft_expr_destroy(ctx, expr_array[i]);
+ expr_array[i] = NULL;
+ }
+
+ return 0;
+
+err_elem_expr_setup:
+ for (; i < num_exprs; i++) {
+ nft_expr_destroy(ctx, expr_array[i]);
+ expr_array[i] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask) &&
+ !nft_set_elem_expired(ext) &&
+ !nft_set_elem_is_dead(ext))
+ return ext;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
+
+static int nft_setelem_catchall_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_elem_priv **priv)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_next(net);
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask)) {
+ *priv = catchall->elem;
+ return -EEXIST;
+ }
+ }
+
+ catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
+ if (!catchall)
+ return -ENOMEM;
+
+ catchall->elem = elem->priv;
+ list_add_tail_rcu(&catchall->list, &set->catchall_list);
+
+ return 0;
+}
+
+static int nft_setelem_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_elem_priv **elem_priv,
+ unsigned int flags)
+{
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_insert(net, set, elem, elem_priv);
+ else
+ ret = set->ops->insert(net, set, elem, elem_priv);
+
+ return ret;
+}
+
+static bool nft_setelem_is_catchall(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
+ return true;
+
+ return false;
+}
+
+static void nft_setelem_activate(struct net *net, struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (nft_setelem_is_catchall(set, elem_priv)) {
+ nft_clear(net, ext);
+ } else {
+ set->ops->activate(net, set, elem_priv);
+ }
+}
+
+static void nft_trans_elem_update(const struct nft_set *set,
+ const struct nft_trans_one_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_elem_update *update = elem->update;
+
+ if (update->flags & NFT_TRANS_UPD_TIMEOUT)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout);
+
+ if (update->flags & NFT_TRANS_UPD_EXPIRATION)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration);
+}
+
+static void nft_trans_elems_add(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ struct nft_trans_one_elem *elem = &te->elems[i];
+
+ if (elem->update)
+ nft_trans_elem_update(te->set, elem);
+ else
+ nft_setelem_activate(ctx->net, te->set, elem->priv);
+
+ nf_tables_setelem_notify(ctx, te->set, elem->priv,
+ NFT_MSG_NEWSETELEM);
+ kfree(elem->update);
+ }
+}
+
+static int nft_setelem_catchall_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_is_active_next(net, ext))
+ continue;
+
+ kfree(elem->priv);
+ elem->priv = catchall->elem;
+ nft_set_elem_change_active(net, set, ext);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int __nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ void *priv;
+
+ priv = set->ops->deactivate(net, set, elem);
+ if (!priv)
+ return -ENOENT;
+
+ kfree(elem->priv);
+ elem->priv = priv;
+ set->ndeact++;
+
+ return 0;
+}
+
+static int nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem, u32 flags)
+{
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_deactivate(net, set, elem);
+ else
+ ret = __nft_setelem_deactivate(net, set, elem);
+
+ return ret;
+}
+
+static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
+{
+ list_del_rcu(&catchall->list);
+ kfree_rcu(catchall, rcu);
+}
+
+static void nft_setelem_catchall_remove(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_elem_catchall *catchall, *next;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ if (catchall->elem == elem_priv) {
+ nft_setelem_catchall_destroy(catchall);
+ break;
+ }
+ }
+}
+
+static void nft_setelem_remove(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ if (nft_setelem_is_catchall(set, elem_priv))
+ nft_setelem_catchall_remove(net, set, elem_priv);
+ else
+ set->ops->remove(net, set, elem_priv);
+}
+
+static void nft_trans_elems_remove(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ WARN_ON_ONCE(te->elems[i].update);
+
+ nf_tables_setelem_notify(ctx, te->set,
+ te->elems[i].priv,
+ te->nft_trans.msg_type);
+
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) {
+ atomic_dec(&te->set->nelems);
+ te->set->ndeact--;
+ }
+ }
+}
+
+static bool nft_setelem_valid_key_end(const struct nft_set *set,
+ struct nlattr **nla, u32 flags)
+{
+ if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) ==
+ (NFT_SET_CONCAT | NFT_SET_INTERVAL)) {
+ if (flags & NFT_SET_ELEM_INTERVAL_END)
+ return false;
+
+ if (nla[NFTA_SET_ELEM_KEY_END] &&
+ flags & NFT_SET_ELEM_CATCHALL)
+ return false;
+ } else {
+ if (nla[NFTA_SET_ELEM_KEY_END])
+ return false;
+ }
+
+ return true;
+}
+
+static u32 nft_set_maxsize(const struct nft_set *set)
+{
+ u32 maxsize, delta;
+
+ if (!set->size)
+ return UINT_MAX;
+
+ if (set->ops->adjust_maxsize)
+ delta = set->ops->adjust_maxsize(set);
+ else
+ delta = 0;
+
+ if (check_add_overflow(set->size, set->ndeact, &maxsize))
+ return UINT_MAX;
+
+ if (check_add_overflow(maxsize, delta, &maxsize))
+ return UINT_MAX;
+
+ return maxsize;
}
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
u8 genmask = nft_genmask_next(ctx->net);
+ u32 flags = 0, size = 0, num_exprs = 0;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_elem_priv *elem_priv;
struct nft_object *obj = NULL;
- struct nft_expr *expr = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
enum nft_registers dreg;
struct nft_trans *trans;
- u32 flags = 0;
- u64 timeout;
u64 expiration;
+ u64 timeout;
+ int err, i;
u8 ulen;
- int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy, NULL);
if (err < 0)
return err;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
- return -EINVAL;
-
nft_set_ext_prepare(&tmpl);
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
+ if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+ (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
+ return -EINVAL;
+
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
@@ -5000,13 +7258,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return -EINVAL;
}
+ if (set->flags & NFT_SET_OBJECT) {
+ if (!nla[NFTA_SET_ELEM_OBJREF] &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_OBJREF])
+ return -EINVAL;
+ }
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
(nla[NFTA_SET_ELEM_DATA] ||
nla[NFTA_SET_ELEM_OBJREF] ||
nla[NFTA_SET_ELEM_TIMEOUT] ||
nla[NFTA_SET_ELEM_EXPIRATION] ||
nla[NFTA_SET_ELEM_USERDATA] ||
- nla[NFTA_SET_ELEM_EXPR]))
+ nla[NFTA_SET_ELEM_EXPR] ||
+ nla[NFTA_SET_ELEM_KEY_END] ||
+ nla[NFTA_SET_ELEM_EXPRESSIONS]))
return -EINVAL;
timeout = 0;
@@ -5017,7 +7289,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
&timeout);
if (err)
return err;
- } else if (set->flags & NFT_SET_TIMEOUT) {
+ } else if (set->flags & NFT_SET_TIMEOUT &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
timeout = set->timeout;
}
@@ -5025,37 +7298,89 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
return -EINVAL;
+ if (timeout == 0)
+ return -EOPNOTSUPP;
+
err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
&expiration);
if (err)
return err;
+
+ if (expiration > timeout)
+ return -ERANGE;
}
- if (nla[NFTA_SET_ELEM_EXPR] != NULL) {
+ if (nla[NFTA_SET_ELEM_EXPR]) {
+ struct nft_expr *expr;
+
+ if (set->num_exprs && set->num_exprs != 1)
+ return -EOPNOTSUPP;
+
expr = nft_set_elem_expr_alloc(ctx, set,
nla[NFTA_SET_ELEM_EXPR]);
if (IS_ERR(expr))
return PTR_ERR(expr);
- err = -EOPNOTSUPP;
- if (set->expr && set->expr->ops != expr->ops)
+ expr_array[0] = expr;
+ num_exprs = 1;
+
+ if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
goto err_set_elem_expr;
- } else if (set->expr) {
- expr = kzalloc(set->expr->ops->size, GFP_KERNEL);
- if (!expr)
- return -ENOMEM;
+ }
+ } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX ||
+ (set->num_exprs && set->num_exprs == i)) {
+ err = -E2BIG;
+ goto err_set_elem_expr;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_elem_expr;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_elem_expr;
+ }
+ expr_array[i] = expr;
+ num_exprs++;
- err = nft_expr_clone(expr, set->expr);
- if (err < 0)
+ if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
goto err_set_elem_expr;
+ }
+ } else if (set->num_exprs > 0 &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
+ err = nft_set_elem_expr_clone(ctx, set, expr_array);
+ if (err < 0)
+ goto err_set_elem_expr_clone;
+
+ num_exprs = set->num_exprs;
}
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- goto err_set_elem_expr;
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err_set_elem_expr;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto err_parse_key;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
@@ -5063,32 +7388,46 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
goto err_parse_key;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto err_parse_key_end;
}
- if (timeout > 0) {
- nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
- if (timeout != set->timeout)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ if (set->flags & NFT_SET_TIMEOUT) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ if (err < 0)
+ goto err_parse_key_end;
}
- if (expr)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR,
- expr->ops->size);
+ if (num_exprs) {
+ for (i = 0; i < num_exprs; i++)
+ size += expr_array[i]->ops->size;
- if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
- if (!(set->flags & NFT_SET_OBJECT)) {
- err = -EINVAL;
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+ if (err < 0)
goto err_parse_key_end;
- }
+ }
+
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
obj = nft_obj_lookup(ctx->net, ctx->table,
nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
+ obj = NULL;
goto err_parse_key_end;
}
- nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+
+ if (!nft_use_inc(&obj->use)) {
+ err = -EMFILE;
+ obj = NULL;
+ goto err_parse_key_end;
+ }
+
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
@@ -5118,11 +7457,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type == NFT_DATA_VERDICT &&
(elem.data.val.verdict.code == NFT_GOTO ||
elem.data.val.verdict.code == NFT_JUMP))
- nft_validate_state_update(ctx->net,
+ nft_validate_state_update(ctx->table,
NFT_VALIDATE_NEED);
}
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ if (err < 0)
+ goto err_parse_data;
}
/* The full maximum length of userdata can exceed the maximum
@@ -5132,51 +7473,59 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ulen = 0;
if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
- if (ulen > 0)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
- ulen);
+ if (ulen > 0) {
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+ ulen);
+ if (err < 0)
+ goto err_parse_data;
+ }
}
- err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, elem.data.val.data,
- timeout, expiration, GFP_KERNEL);
- if (elem.priv == NULL)
+ timeout, expiration, GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
goto err_parse_data;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
+
+ if (obj)
+ *nft_set_ext_obj(ext) = obj;
+
if (ulen > 0) {
+ if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
+ err = -EINVAL;
+ goto err_elem_free;
+ }
udata = nft_set_ext_userdata(ext);
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
- if (obj) {
- *nft_set_ext_obj(ext) = obj;
- obj->use++;
- }
- if (expr) {
- memcpy(nft_set_ext_expr(ext), expr, expr->ops->size);
- kfree(expr);
- expr = NULL;
- }
+ err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
+ if (err < 0)
+ goto err_elem_free;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
- if (trans == NULL)
- goto err_trans;
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err_elem_free;
+ }
- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
- err = set->ops->insert(ctx->net, set, &elem, &ext2);
+ ext->genmask = nft_genmask_cur(ctx->net);
+
+ err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
if (err) {
if (err == -EEXIST) {
+ ext2 = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
- nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
- err = -EBUSY;
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
goto err_element_clash;
- }
if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
memcmp(nft_set_ext_data(ext),
@@ -5184,9 +7533,41 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
(nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
*nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
- err = -EBUSY;
- else if (!(nlmsg_flags & NLM_F_EXCL))
+ goto err_element_clash;
+ else if (!(nlmsg_flags & NLM_F_EXCL)) {
err = 0;
+ if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) {
+ struct nft_elem_update update = { };
+
+ if (timeout != nft_set_ext_timeout(ext2)->timeout) {
+ update.timeout = timeout;
+ if (expiration == 0)
+ expiration = timeout;
+
+ update.flags |= NFT_TRANS_UPD_TIMEOUT;
+ }
+ if (expiration) {
+ update.expiration = expiration;
+ update.flags |= NFT_TRANS_UPD_EXPIRATION;
+ }
+
+ if (update.flags) {
+ struct nft_trans_one_elem *ue;
+
+ ue = &nft_trans_container_elem(trans)->elems[0];
+
+ ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL);
+ if (!ue->update) {
+ err = -ENOMEM;
+ goto err_element_clash;
+ }
+
+ ue->priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
+ goto err_elem_free;
+ }
+ }
+ }
} else if (err == -ENOTEMPTY) {
/* ENOTEMPTY reports overlapping between this element
* and an existing one.
@@ -5196,46 +7577,52 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err_element_clash;
}
- if (set->size &&
- !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
- err = -ENFILE;
- goto err_set_full;
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+ unsigned int max = nft_set_maxsize(set);
+
+ if (!atomic_add_unless(&set->nelems, 1, max)) {
+ err = -ENFILE;
+ goto err_set_full;
+ }
}
- nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
err_set_full:
- set->ops->remove(ctx->net, set, &elem);
+ nft_setelem_remove(ctx->net, set, elem.priv);
err_element_clash:
kfree(trans);
-err_trans:
- if (obj)
- obj->use--;
-
+err_elem_free:
nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
nft_data_release(&elem.data.val, desc.type);
err_parse_key_end:
+ if (obj)
+ nft_use_dec_restore(&obj->use);
+
nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
err_set_elem_expr:
- if (expr != NULL)
- nft_expr_destroy(ctx, expr);
-
+ for (i = 0; i < num_exprs && expr_array[i]; i++)
+ nft_expr_destroy(ctx, expr_array[i]);
+err_set_elem_expr_clone:
return err;
}
-static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_next(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err;
@@ -5243,27 +7630,36 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ if (!list_empty(&set->bindings) &&
+ (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
return -EBUSY;
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
- if (err < 0)
+ err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return err;
+ }
}
- if (net->nft.validate_state == NFT_VALIDATE_DO)
- return nft_table_validate(net, ctx.table);
+ if (table->validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
return 0;
}
@@ -5281,38 +7677,100 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
*/
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
+ struct nft_chain *chain;
+
if (type == NFT_DATA_VERDICT) {
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use++;
+ chain = data->verdict.chain;
+ nft_use_inc_restore(&chain->use);
break;
}
}
}
-static void nft_set_elem_activate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+static int nft_setelem_active_next(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ u8 genmask = nft_genmask_next(net);
+
+ return nft_set_elem_active(ext, genmask);
+}
+
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_hold(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use++;
+ nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
-static void nft_set_elem_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+}
+
+/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem.
+ * No notifications and no ndeact changes.
+ *
+ * Returns true if set had been added to (i.e., elements need to be removed again).
+ */
+static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ bool removed = false;
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (te->elems[i].update) {
+ kfree(te->elems[i].update);
+ te->elems[i].update = NULL;
+ /* Update request, so do not release this element */
+ te->elems[i].priv = NULL;
+ continue;
+ }
+
+ if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ atomic_dec(&te->set->nelems);
+
+ removed = true;
+ }
+
+ return removed;
+}
+
+/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */
+static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) {
+ nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv);
+ nft_setelem_activate(ctx->net, te->set, te->elems[i].priv);
+ }
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ te->set->ndeact--;
+ }
}
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
@@ -5324,7 +7782,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_ext *ext;
struct nft_trans *trans;
u32 flags = 0;
- void *priv;
int err;
err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
@@ -5332,39 +7789,54 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (err < 0)
return err;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
+ err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
+ if (err < 0)
+ return err;
+
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
return -EINVAL;
nft_set_ext_prepare(&tmpl);
- err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
- if (err < 0)
- return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
- err = nft_setelem_parse_key(ctx, set, &elem.key.val,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto fail_elem;
+ }
if (nla[NFTA_SET_ELEM_KEY_END]) {
err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
nla[NFTA_SET_ELEM_KEY_END]);
if (err < 0)
- return err;
+ goto fail_elem;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto fail_elem_key_end;
}
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
elem.key_end.val.data, NULL, 0, 0,
- GFP_KERNEL);
- if (elem.priv == NULL)
- goto fail_elem;
+ GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
+ goto fail_elem_key_end;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
@@ -5374,122 +7846,161 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (trans == NULL)
goto fail_trans;
- priv = set->ops->deactivate(ctx->net, set, &elem);
- if (priv == NULL) {
- err = -ENOENT;
+ err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
+ if (err < 0)
goto fail_ops;
- }
- kfree(elem.priv);
- elem.priv = priv;
- nft_set_elem_deactivate(ctx->net, set, &elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem.priv);
- nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
fail_ops:
kfree(trans);
fail_trans:
kfree(elem.priv);
+fail_elem_key_end:
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
fail_elem:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
}
-static int nft_flush_set(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_setelem_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_trans *trans;
- int err;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+ struct_size_t(struct nft_trans_elem, elems, 1));
if (!trans)
return -ENOMEM;
- if (!set->ops->flush(ctx->net, set, elem->priv)) {
- err = -ENOENT;
- goto err1;
- }
+ set->ops->flush(ctx->net, set, elem_priv);
set->ndeact++;
- nft_set_elem_deactivate(ctx->net, set, elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
nft_trans_elem_set(trans) = set;
- nft_trans_elem(trans) = *elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->nelems = 1;
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
-err1:
- kfree(trans);
- return err;
}
-static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- u8 genmask = nft_genmask_next(net);
+ struct nft_trans *trans;
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
+ if (!trans)
+ return -ENOMEM;
+
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
+
+ return 0;
+}
+
+static int nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
+ if (ret < 0)
+ break;
+ nft_set_elem_change_active(ctx->net, set, ext);
+ }
+
+ return ret;
+}
+
+static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
+{
+ struct nft_set_iter iter = {
+ .genmask = genmask,
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_setelem_flush,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_flush(ctx, set);
+
+ return iter.err;
+}
+
+static int nf_tables_delsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ }
+
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
+ if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
return -EBUSY;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
- struct nft_set_iter iter = {
- .genmask = genmask,
- .fn = nft_flush_set,
- };
- set->ops->walk(&ctx, set, &iter);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- return iter.err;
- }
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+ return nft_set_flush(&ctx, set, genmask);
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
- if (err < 0)
- break;
+ if (err == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
+ continue;
- set->ndeact++;
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return err;
+ }
}
- return err;
-}
-
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
- struct nft_set_gc_batch *gcb;
- unsigned int i;
-
- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
- for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
- kfree(gcb);
-}
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp)
-{
- struct nft_set_gc_batch *gcb;
- gcb = kzalloc(sizeof(*gcb), gfp);
- if (gcb == NULL)
- return gcb;
- gcb->head.set = set;
- return gcb;
+ return 0;
}
/*
@@ -5498,7 +8009,7 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
/**
* nft_register_obj- register nf_tables stateful object type
- * @obj: object type
+ * @obj_type: object type
*
* Registers the object type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
@@ -5517,7 +8028,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj);
/**
* nft_unregister_obj - unregister nf_tables object type
- * @obj: object type
+ * @obj_type: object type
*
* Unregisters the object type for use with nf_tables.
*/
@@ -5539,7 +8050,7 @@ struct nft_object *nft_obj_lookup(const struct net *net,
struct rhlist_head *tmp, *list;
struct nft_object *obj;
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
k.name = search;
WARN_ON_ONCE(!rcu_read_lock_held() &&
@@ -5586,6 +8097,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
[NFTA_OBJ_HANDLE] = { .type = NLA_U64},
+ [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -5621,7 +8134,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
}
err = -ENOMEM;
- obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+ obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
if (!obj)
goto err2;
@@ -5658,11 +8171,15 @@ nla_put_failure:
return -1;
}
-static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
{
const struct nft_object_type *type;
- list_for_each_entry(type, &nf_tables_objects, list) {
+ list_for_each_entry_rcu(type, &nf_tables_objects, list) {
+ if (type->family != NFPROTO_UNSPEC &&
+ type->family != family)
+ continue;
+
if (objtype == type->type)
return type;
}
@@ -5670,13 +8187,17 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
}
static const struct nft_object_type *
-nft_obj_type_get(struct net *net, u32 objtype)
+nft_obj_type_get(struct net *net, u32 objtype, u8 family)
{
const struct nft_object_type *type;
- type = __nft_obj_type_get(objtype);
- if (type != NULL && try_module_get(type->owner))
+ rcu_read_lock();
+ type = __nft_obj_type_get(objtype, family);
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -5695,12 +8216,13 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
{
struct nft_object *newobj;
struct nft_trans *trans;
- int err;
+ int err = -ENOMEM;
+ /* caller must have obtained type->owner reference. */
trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
sizeof(struct nft_trans_obj));
if (!trans)
- return -ENOMEM;
+ goto err_trans;
newobj = nft_obj_init(ctx, type, attr);
if (IS_ERR(newobj)) {
@@ -5711,24 +8233,25 @@ static int nf_tables_updobj(const struct nft_ctx *ctx,
nft_trans_obj(trans) = obj;
nft_trans_obj_update(trans) = true;
nft_trans_obj_newobj(trans) = newobj;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_free_trans:
kfree(trans);
+err_trans:
+ module_put(type->owner);
return err;
}
-static int nf_tables_newobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_object_type *type;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct net *net = info->net;
struct nft_table *table;
struct nft_object *obj;
struct nft_ctx ctx;
@@ -5740,7 +8263,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
!nla[NFTA_OBJ_DATA])
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -5755,63 +8279,88 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- type = __nft_obj_type_get(objtype);
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ if (!obj->ops->update)
+ return 0;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (WARN_ON_ONCE(IS_ERR(type)))
+ return PTR_ERR(type);
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+ /* type->owner reference is put when transaction object is released. */
return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- type = nft_obj_type_get(net, objtype);
- if (IS_ERR(type))
- return PTR_ERR(type);
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (IS_ERR(type)) {
+ err = PTR_ERR(type);
+ goto err_type;
+ }
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err1;
+ goto err_init;
}
obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
- obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+ obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
if (!obj->key.name) {
err = -ENOMEM;
- goto err2;
+ goto err_strdup;
+ }
+
+ if (nla[NFTA_OBJ_USERDATA]) {
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (obj->udata == NULL)
+ goto err_userdata;
+
+ obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
}
err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
if (err < 0)
- goto err3;
+ goto err_trans;
err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
- table->use++;
+
return 0;
-err4:
+err_obj_ht:
/* queued in transaction log */
INIT_LIST_HEAD(&obj->list);
return err;
-err3:
+err_trans:
+ kfree(obj->udata);
+err_userdata:
kfree(obj->key.name);
-err2:
+err_strdup:
if (obj->ops->destroy)
obj->ops->destroy(&ctx, obj);
kfree(obj);
-err1:
+err_init:
module_put(type->owner);
+err_type:
+ nft_use_dec_restore(&table->use);
+
return err;
}
@@ -5820,28 +8369,35 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table,
struct nft_object *obj, bool reset)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
- nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
- nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELOBJ ||
+ event == NFT_MSG_DESTROYOBJ) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
+ if (obj->udata &&
+ nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -5850,168 +8406,247 @@ nla_put_failure:
return -1;
}
-struct nft_obj_filter {
+static void audit_log_obj_reset(const struct nft_table *table,
+ unsigned int base_seq, unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
+struct nft_obj_dump_ctx {
+ unsigned int s_idx;
char *table;
u32 type;
+ bool reset;
};
static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- unsigned int idx = 0, s_idx = cb->args[0];
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ unsigned int entries = 0;
struct nft_object *obj;
- bool reset = false;
-
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ unsigned int idx = 0;
+ int rc = 0;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
+ entries = 0;
list_for_each_entry_rcu(obj, &table->objects, list) {
if (!nft_is_active(net, obj))
goto cont;
- if (idx < s_idx)
+ if (idx < ctx->s_idx)
goto cont;
- if (idx > s_idx)
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- if (filter && filter->table &&
- strcmp(filter->table, table->name))
+ if (ctx->table && strcmp(ctx->table, table->name))
goto cont;
- if (filter &&
- filter->type != NFT_OBJECT_UNSPEC &&
- obj->ops->type->type != filter->type)
+ if (ctx->type != NFT_OBJECT_UNSPEC &&
+ obj->ops->type->type != ctx->type)
goto cont;
- if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFT_MSG_NEWOBJ,
- NLM_F_MULTI | NLM_F_APPEND,
- table->family, table,
- obj, reset) < 0)
- goto done;
+ rc = nf_tables_fill_obj_info(skb, net,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ table->family, table,
+ obj, ctx->reset);
+ if (rc < 0)
+ break;
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
idx++;
}
+ if (ctx->reset && entries)
+ audit_log_obj_reset(table, nft_base_seq(net), entries);
+ if (rc < 0)
+ break;
}
-done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_obj(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_obj_filter *filter = NULL;
- if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
- filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
- if (!filter)
- return -ENOMEM;
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
- if (!filter->table) {
- kfree(filter);
- return -ENOMEM;
- }
- }
-
- if (nla[NFTA_OBJ_TYPE])
- filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ if (nla[NFTA_OBJ_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
}
- cb->data = filter;
+ if (nla[NFTA_OBJ_TYPE])
+ ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
return 0;
}
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_obj_start(cb);
+}
+
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
- if (filter) {
- kfree(filter->table);
- kfree(filter);
- }
+ kfree(ctx->table);
return 0;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct nft_object *obj;
struct sk_buff *skb2;
- bool reset = false;
u32 objtype;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start = nf_tables_dump_obj_start,
- .dump = nf_tables_dump_obj,
- .done = nf_tables_dump_obj_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
- }
-
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
- return PTR_ERR(obj);
+ return ERR_CAST(obj);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
-
- if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ return ERR_PTR(-ENOMEM);
- err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ err = nf_tables_fill_obj_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
family, table, obj, reset);
- if (err < 0)
- goto err;
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
- kfree_skb(skb2);
- return err;
+ return skb2;
+}
+
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct sk_buff *skb2;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_obj_start,
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ skb2 = nf_tables_getobj_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ return nfnetlink_unicast(skb2, info->net, portid);
+}
+
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dumpreset_obj_start,
+ .dump = nf_tables_dumpreset_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getobj_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_OBJ_TABLE]),
+ (char *)nla_data(nla[NFTA_OBJ_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
@@ -6021,17 +8656,17 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
module_put(obj->ops->type->owner);
kfree(obj->key.name);
+ kfree(obj->udata);
kfree(obj);
}
-static int nf_tables_delobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_object *obj;
@@ -6042,7 +8677,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
(!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -6058,6 +8694,10 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
}
if (IS_ERR(obj)) {
+ if (PTR_ERR(obj) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(obj);
}
@@ -6066,15 +8706,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
return -EBUSY;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
return nft_delobj(&ctx, obj);
}
-void nft_obj_notify(struct net *net, const struct nft_table *table,
- struct nft_object *obj, u32 portid, u32 seq, int event,
- int family, int report, gfp_t gfp)
+static void
+__nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct sk_buff *skb;
int err;
@@ -6086,25 +8728,47 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
if (skb == NULL)
goto err;
- err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
- table, obj, false);
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event,
+ flags & (NLM_F_CREATE | NLM_F_EXCL),
+ family, table, obj, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+ nft_notify_enqueue(skb, report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+
+void nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
+{
+ char *buf = kasprintf(gfp, "%s:%u",
+ table->name, nft_base_seq(net));
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ event == NFT_MSG_NEWOBJ ?
+ AUDIT_NFT_OP_OBJ_REGISTER :
+ AUDIT_NFT_OP_OBJ_UNREGISTER,
+ gfp);
+ kfree(buf);
+
+ __nft_obj_notify(net, table, obj, portid, seq, event,
+ flags, family, report, gfp);
+}
EXPORT_SYMBOL_GPL(nft_obj_notify);
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
struct nft_object *obj, int event)
{
- nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
- ctx->family, ctx->report, GFP_KERNEL);
+ __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+ ctx->seq, event, ctx->flags, ctx->family,
+ ctx->report, GFP_KERNEL);
}
/*
@@ -6136,12 +8800,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
[NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 },
};
-struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
- list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, flowtable->name) &&
nft_active_genmask(flowtable, genmask))
return flowtable;
@@ -6155,11 +8821,12 @@ void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
enum nft_trans_phase phase)
{
switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
case NFT_TRANS_PREPARE:
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
- flowtable->use--;
- /* fall through */
+ nft_use_dec(&flowtable->use);
+ fallthrough;
default:
return;
}
@@ -6193,26 +8860,31 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX
};
static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
- const struct nlattr *attr,
+ const struct nlattr * const nla[],
struct nft_flowtable_hook *flowtable_hook,
- struct nft_flowtable *flowtable, bool add)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack, bool add)
{
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int hooknum, priority;
int err;
INIT_LIST_HEAD(&flowtable_hook->list);
- err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
+ err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
+ nla[NFTA_FLOWTABLE_HOOK],
nft_flowtable_hook_policy, NULL);
if (err < 0)
return err;
if (add) {
if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
- !tb[NFTA_FLOWTABLE_HOOK_PRIORITY])
- return -EINVAL;
+ !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
+ return -ENOENT;
+ }
hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
if (hooknum != NF_NETDEV_INGRESS)
@@ -6242,27 +8914,32 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
err = nf_tables_parse_netdev_hooks(ctx->net,
tb[NFTA_FLOWTABLE_HOOK_DEVS],
- &flowtable_hook->list);
+ &flowtable_hook->list,
+ extack);
if (err < 0)
return err;
}
list_for_each_entry(hook, &flowtable_hook->list, list) {
- hook->ops.pf = NFPROTO_NETDEV;
- hook->ops.hooknum = flowtable_hook->num;
- hook->ops.priority = flowtable_hook->priority;
- hook->ops.priv = &flowtable->data;
- hook->ops.hook = flowtable->data.type->hook;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable_hook->num;
+ ops->priority = flowtable_hook->priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ }
}
return err;
}
+/* call under rcu_read_lock */
static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
{
const struct nf_flowtable_type *type;
- list_for_each_entry(type, &nf_tables_flowtables, list) {
+ list_for_each_entry_rcu(type, &nf_tables_flowtables, list) {
if (family == type->family)
return type;
}
@@ -6274,9 +8951,13 @@ nft_flowtable_type_get(struct net *net, u8 family)
{
const struct nf_flowtable_type *type;
+ rcu_read_lock();
type = __nft_flowtable_type_get(family);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
@@ -6289,22 +8970,58 @@ nft_flowtable_type_get(struct net *net, u8 family)
}
/* Only called from error and netdev event paths. */
-static void nft_unregister_flowtable_hook(struct net *net,
- struct nft_flowtable *flowtable,
- struct nft_hook *hook)
+static void nft_unregister_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
{
- nf_unregister_net_hook(net, &hook->ops);
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+ nf_unregister_net_hook(net, ops);
+ flowtable->data.type->setup(&flowtable->data, ops->dev,
FLOW_BLOCK_UNBIND);
}
+static void __nft_unregister_flowtable_net_hooks(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct list_head *hook_list,
+ bool release_netdev)
+{
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ }
+}
+
static void nft_unregister_flowtable_net_hooks(struct net *net,
+ struct nft_flowtable *flowtable,
struct list_head *hook_list)
{
- struct nft_hook *hook;
+ __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
+}
- list_for_each_entry(hook, hook_list, list)
- nf_unregister_net_hook(net, &hook->ops);
+static int nft_register_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
+{
+ int err;
+
+ err = flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_BIND);
+ if (err < 0)
+ return err;
+
+ err = nf_register_net_hook(net, ops);
+ if (!err)
+ return 0;
+
+ flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_UNBIND);
+ return err;
}
static int nft_register_flowtable_net_hooks(struct net *net,
@@ -6312,83 +9029,111 @@ static int nft_register_flowtable_net_hooks(struct net *net,
struct list_head *hook_list,
struct nft_flowtable *flowtable)
{
- struct nft_hook *hook, *hook2, *next;
+ struct nft_hook *hook, *next;
struct nft_flowtable *ft;
+ struct nf_hook_ops *ops;
int err, i = 0;
list_for_each_entry(hook, hook_list, list) {
list_for_each_entry(ft, &table->flowtables, list) {
- list_for_each_entry(hook2, &ft->hook_list, list) {
- if (hook->ops.dev == hook2->ops.dev &&
- hook->ops.pf == hook2->ops.pf) {
- err = -EBUSY;
- goto err_unregister_net_hooks;
- }
+ if (!nft_is_active_next(net, ft))
+ continue;
+
+ if (nft_hook_list_find(&ft->hook_list, hook)) {
+ err = -EEXIST;
+ goto err_unregister_net_hooks;
}
}
- err = flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_BIND);
- if (err < 0)
- goto err_unregister_net_hooks;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nft_register_flowtable_ops(net, flowtable, ops);
+ if (err < 0)
+ goto err_unregister_net_hooks;
- err = nf_register_net_hook(net, &hook->ops);
- if (err < 0) {
- flowtable->data.type->setup(&flowtable->data,
- hook->ops.dev,
- FLOW_BLOCK_UNBIND);
- goto err_unregister_net_hooks;
+ i++;
}
-
- i++;
}
return 0;
err_unregister_net_hooks:
list_for_each_entry_safe(hook, next, hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- nft_unregister_flowtable_hook(net, flowtable, hook);
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
}
-static void nft_flowtable_hooks_destroy(struct list_head *hook_list)
+static void nft_hooks_destroy(struct list_head *hook_list)
{
struct nft_hook *hook, *next;
list_for_each_entry_safe(hook, next, hook_list, list) {
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
}
static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
+ struct nftables_pernet *nft_net;
struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
struct nft_trans *trans;
bool unregister = false;
+ u32 flags;
int err;
- err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, false);
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
if (err < 0)
return err;
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
if (nft_hook_list_find(&flowtable->hook_list, hook)) {
list_del(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free(hook);
+ continue;
}
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
+ trans->table != ctx->table ||
+ !nft_trans_flowtable_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
+ err = -EEXIST;
+ goto err_flowtable_update_hook;
+ }
+ }
+ }
+
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ } else {
+ flags = flowtable->data.flags;
}
err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
@@ -6404,41 +9149,44 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
goto err_flowtable_update_hook;
}
+ nft_trans_flowtable_flags(trans) = flags;
nft_trans_flowtable(trans) = flowtable;
nft_trans_flowtable_update(trans) = true;
INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans));
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_flowtable_update_hook:
list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
- if (unregister)
- nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
+ if (unregister) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(ctx->net,
+ flowtable, ops);
+ }
list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ nft_netdev_hook_free_rcu(hook);
}
return err;
}
-static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct netlink_ext_ack *extack = info->extack;
struct nft_flowtable_hook flowtable_hook;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nf_flowtable_type *type;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
- struct nft_hook *hook, *next;
+ struct net *net = info->net;
struct nft_table *table;
+ struct nft_trans *trans;
struct nft_ctx ctx;
int err;
@@ -6448,13 +9196,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
}
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
if (IS_ERR(flowtable)) {
err = PTR_ERR(flowtable);
@@ -6463,27 +9211,32 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return -EEXIST;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- return nft_flowtable_update(&ctx, nlh, flowtable);
+ return nft_flowtable_update(&ctx, info->nlh, flowtable, extack);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
- if (!flowtable)
- return -ENOMEM;
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
+ if (!flowtable) {
+ err = -ENOMEM;
+ goto flowtable_alloc;
+ }
flowtable->table = table;
flowtable->handle = nf_tables_alloc_handle(table);
INIT_LIST_HEAD(&flowtable->hook_list);
- flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+ flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
if (!flowtable->name) {
err = -ENOMEM;
goto err1;
@@ -6498,8 +9251,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flowtable->data.flags =
ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
goto err3;
+ }
}
write_pnet(&flowtable->data.net, net);
@@ -6508,38 +9263,37 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (err < 0)
goto err3;
- err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, true);
+ err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable,
+ extack, true);
if (err < 0)
- goto err4;
+ goto err_flowtable_parse_hooks;
list_splice(&flowtable_hook.list, &flowtable->hook_list);
flowtable->data.priority = flowtable_hook.priority;
flowtable->hooknum = flowtable_hook.num;
+ trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto err_flowtable_trans;
+ }
+
+ /* This must be LAST to ensure no packets are walking over this flowtable. */
err = nft_register_flowtable_net_hooks(ctx.net, table,
&flowtable->hook_list,
flowtable);
- if (err < 0) {
- nft_flowtable_hooks_destroy(&flowtable->hook_list);
- goto err4;
- }
-
- err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err5;
+ goto err_flowtable_hooks;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
- table->use++;
return 0;
-err5:
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- nft_unregister_flowtable_hook(net, flowtable, hook);
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
- }
-err4:
+
+err_flowtable_hooks:
+ nft_trans_destroy(trans);
+err_flowtable_trans:
+ nft_hooks_destroy(&flowtable->hook_list);
+err_flowtable_parse_hooks:
flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
@@ -6547,64 +9301,80 @@ err2:
kfree(flowtable->name);
err1:
kfree(flowtable);
+flowtable_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
+static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook)
+{
+ struct nft_hook *this, *next;
+
+ list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
+ list_del(&this->list);
+ nft_netdev_hook_free(this);
+ }
+}
+
static int nft_delflowtable_hook(struct nft_ctx *ctx,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_flowtable_hook flowtable_hook;
- struct nft_hook *this, *next, *hook;
+ LIST_HEAD(flowtable_del_list);
+ struct nft_hook *this, *hook;
struct nft_trans *trans;
int err;
- err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
- &flowtable_hook, flowtable, false);
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
if (err < 0)
return err;
- list_for_each_entry_safe(this, next, &flowtable_hook.list, list) {
+ list_for_each_entry(this, &flowtable_hook.list, list) {
hook = nft_hook_list_find(&flowtable->hook_list, this);
if (!hook) {
err = -ENOENT;
goto err_flowtable_del_hook;
}
- hook->inactive = true;
- list_del(&this->list);
- kfree(this);
+ list_move(&hook->list, &flowtable_del_list);
}
trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
sizeof(struct nft_trans_flowtable));
- if (!trans)
- return -ENOMEM;
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_flowtable_del_hook;
+ }
nft_trans_flowtable(trans) = flowtable;
nft_trans_flowtable_update(trans) = true;
INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
+ nft_flowtable_hook_release(&flowtable_hook);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
err_flowtable_del_hook:
- list_for_each_entry(hook, &flowtable_hook.list, list)
- hook->inactive = false;
+ list_splice(&flowtable_del_list, &flowtable->hook_list);
+ nft_flowtable_hook_release(&flowtable_hook);
return err;
}
-static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
@@ -6615,7 +9385,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
@@ -6626,18 +9396,22 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_FLOWTABLE_NAME];
- flowtable = nft_flowtable_lookup(table, attr, genmask);
+ flowtable = nft_flowtable_lookup(net, table, attr, genmask);
}
if (IS_ERR(flowtable)) {
+ if (PTR_ERR(flowtable) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(flowtable);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
if (nla[NFTA_FLOWTABLE_HOOK])
- return nft_delflowtable_hook(&ctx, flowtable);
+ return nft_delflowtable_hook(&ctx, flowtable, extack);
if (flowtable->use > 0) {
NL_SET_BAD_ATTR(extack, attr);
@@ -6654,25 +9428,29 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
struct list_head *hook_list)
{
struct nlattr *nest, *nest_devs;
- struct nfgenmsg *nfmsg;
struct nft_hook *hook;
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
- NFTA_FLOWTABLE_PAD) ||
+ NFTA_FLOWTABLE_PAD))
+ goto nla_put_failure;
+
+ if (!hook_list &&
+ (event == NFT_MSG_DELFLOWTABLE ||
+ event == NFT_MSG_DESTROYFLOWTABLE)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
goto nla_put_failure;
@@ -6687,8 +9465,12 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
if (!nest_devs)
goto nla_put_failure;
- list_for_each_entry_rcu(hook, hook_list, list) {
- if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
+ if (!hook_list)
+ hook_list = &flowtable->hook_list;
+
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -6715,12 +9497,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -6741,8 +9525,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
NFT_MSG_NEWFLOWTABLE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- flowtable,
- &flowtable->hook_list) < 0)
+ flowtable, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -6793,21 +9576,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_flowtable_start,
.dump = nf_tables_dump_flowtable,
@@ -6816,48 +9598,54 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
.data = (void *)nla,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_FLOWTABLE_NAME])
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
- if (IS_ERR(flowtable))
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return PTR_ERR(flowtable);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
+ info->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE, 0, family,
- flowtable, &flowtable->hook_list);
+ flowtable, NULL);
if (err < 0)
- goto err;
+ goto err_fill_flowtable_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_flowtable_info:
kfree_skb(skb2);
return err;
}
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
struct nft_flowtable *flowtable,
- struct list_head *hook_list,
- int event)
+ struct list_head *hook_list, int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct sk_buff *skb;
+ u16 flags = 0;
int err;
- if (ctx->report &&
+ if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
return;
@@ -6865,16 +9653,18 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
- ctx->seq, event, 0,
+ ctx->seq, event, flags,
ctx->family, flowtable, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -6886,10 +9676,8 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
flowtable->data.type->free(&flowtable->data);
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
- FLOW_BLOCK_UNBIND);
list_del_rcu(&hook->list);
- kfree(hook);
+ nft_netdev_hook_free_rcu(hook);
}
kfree(flowtable->name);
module_put(flowtable->data.type->owner);
@@ -6900,20 +9688,15 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
+ NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -6926,44 +9709,132 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void nft_flowtable_event(unsigned long event, struct net_device *dev,
- struct nft_flowtable *flowtable)
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops);
+
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_rcu(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
+
+static int nft_flowtable_event(unsigned long event, struct net_device *dev,
+ struct nft_flowtable *flowtable, bool changename)
{
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &flowtable->hook_list, list) {
- if (hook->ops.dev != dev)
- continue;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
- /* flow_offload_netdev_event() cleans up entries for us. */
- nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook);
- list_del_rcu(&hook->list);
- kfree_rcu(hook, rcu);
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ /* flow_offload_netdev_event() cleans up entries for us. */
+ nft_unregister_flowtable_ops(dev_net(dev),
+ flowtable, ops);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable->hooknum;
+ ops->priority = flowtable->data.priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ ops->dev = dev;
+ if (nft_register_flowtable_ops(dev_net(dev),
+ flowtable, ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
break;
}
+ return 0;
+}
+
+static int __nf_tables_flowtable_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nftables_pernet *nft_net = nft_pernet(dev_net(dev));
+ struct nft_flowtable *flowtable;
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ list_for_each_entry(flowtable, &table->flowtables, list) {
+ if (nft_flowtable_event(event, dev,
+ flowtable, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_flowtable_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct nft_flowtable *flowtable;
- struct nft_table *table;
+ struct nftables_pernet *nft_net;
+ int ret = NOTIFY_DONE;
struct net *net;
- if (event != NETDEV_UNREGISTER)
- return 0;
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
net = dev_net(dev);
- mutex_lock(&net->nft.commit_mutex);
- list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- nft_flowtable_event(event, dev, flowtable);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_flowtable_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
- mutex_unlock(&net->nft.commit_mutex);
-
- return NOTIFY_DONE;
+out_unlock:
+ mutex_unlock(&nft_net->commit_mutex);
+ return ret;
}
static struct notifier_block nf_tables_flowtable_notifier = {
@@ -6977,7 +9848,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
struct sk_buff *skb2;
int err;
- if (nlmsg_report(nlh) &&
+ if (!nlmsg_report(nlh) &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
return;
@@ -7000,10 +9871,8 @@ err:
-ENOBUFS);
}
-static int nf_tables_getgen(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
struct sk_buff *skb2;
int err;
@@ -7012,128 +9881,206 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
if (skb2 == NULL)
return -ENOMEM;
- err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq);
+ err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq);
if (err < 0)
- goto err;
+ goto err_fill_gen_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+
+err_fill_gen_info:
kfree_skb(skb2);
return err;
}
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_NEWTABLE] = {
- .call_batch = nf_tables_newtable,
+ .call = nf_tables_newtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
- .call_rcu = nf_tables_gettable,
+ .call = nf_tables_gettable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_DELTABLE] = {
- .call_batch = nf_tables_deltable,
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_DESTROYTABLE] = {
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_NEWCHAIN] = {
- .call_batch = nf_tables_newchain,
+ .call = nf_tables_newchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
- .call_rcu = nf_tables_getchain,
+ .call = nf_tables_getchain,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_DELCHAIN] = {
- .call_batch = nf_tables_delchain,
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_DESTROYCHAIN] = {
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_NEWRULE] = {
- .call_batch = nf_tables_newrule,
+ .call = nf_tables_newrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
- .call_rcu = nf_tables_getrule,
+ .call = nf_tables_getrule,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_GETRULE_RESET] = {
+ .call = nf_tables_getrule_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_DELRULE] = {
- .call_batch = nf_tables_delrule,
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_DESTROYRULE] = {
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_NEWSET] = {
- .call_batch = nf_tables_newset,
+ .call = nf_tables_newset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
- .call_rcu = nf_tables_getset,
+ .call = nf_tables_getset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_DELSET] = {
- .call_batch = nf_tables_delset,
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_DESTROYSET] = {
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_NEWSETELEM] = {
- .call_batch = nf_tables_newsetelem,
+ .call = nf_tables_newsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
- .call_rcu = nf_tables_getsetelem,
+ .call = nf_tables_getsetelem,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETSETELEM_RESET] = {
+ .call = nf_tables_getsetelem_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_DELSETELEM] = {
- .call_batch = nf_tables_delsetelem,
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_DESTROYSETELEM] = {
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
- .call_rcu = nf_tables_getgen,
+ .call = nf_tables_getgen,
+ .type = NFNL_CB_RCU,
},
[NFT_MSG_NEWOBJ] = {
- .call_batch = nf_tables_newobj,
+ .call = nf_tables_newobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_DELOBJ] = {
- .call_batch = nf_tables_delobj,
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_DESTROYOBJ] = {
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_NEWFLOWTABLE] = {
- .call_batch = nf_tables_newflowtable,
+ .call = nf_tables_newflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
- .call_rcu = nf_tables_getflowtable,
+ .call = nf_tables_getflowtable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_DELFLOWTABLE] = {
- .call_batch = nf_tables_delflowtable,
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_FLOWTABLE_MAX,
+ .policy = nft_flowtable_policy,
+ },
+ [NFT_MSG_DESTROYFLOWTABLE] = {
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
@@ -7141,20 +10088,23 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
static int nf_tables_validate(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_table *table;
- switch (net->nft.validate_state) {
- case NFT_VALIDATE_SKIP:
- break;
- case NFT_VALIDATE_NEED:
- nft_validate_state_update(net, NFT_VALIDATE_DO);
- /* fall through */
- case NFT_VALIDATE_DO:
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
+ switch (table->validate_state) {
+ case NFT_VALIDATE_SKIP:
+ continue;
+ case NFT_VALIDATE_NEED:
+ nft_validate_state_update(table, NFT_VALIDATE_DO);
+ fallthrough;
+ case NFT_VALIDATE_DO:
if (nft_table_validate(net, table) < 0)
return -EAGAIN;
+
+ nft_validate_state_update(table, NFT_VALIDATE_SKIP);
+ break;
}
- break;
}
return 0;
@@ -7167,51 +10117,53 @@ static int nf_tables_validate(struct net *net)
*
* We defer the drop policy until the transaction has been finalized.
*/
-static void nft_chain_commit_drop_policy(struct nft_trans *trans)
+static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans)
{
struct nft_base_chain *basechain;
- if (nft_trans_chain_policy(trans) != NF_DROP)
+ if (trans->policy != NF_DROP)
return;
- if (!nft_is_base_chain(trans->ctx.chain))
+ if (!nft_is_base_chain(trans->chain))
return;
- basechain = nft_base_chain(trans->ctx.chain);
+ basechain = nft_base_chain(trans->chain);
basechain->policy = NF_DROP;
}
-static void nft_chain_commit_update(struct nft_trans *trans)
+static void nft_chain_commit_update(struct nft_trans_chain *trans)
{
+ struct nft_table *table = trans->nft_trans_binding.nft_trans.table;
struct nft_base_chain *basechain;
- if (nft_trans_chain_name(trans)) {
- rhltable_remove(&trans->ctx.table->chains_ht,
- &trans->ctx.chain->rhlhead,
+ if (trans->name) {
+ rhltable_remove(&table->chains_ht,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
- swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
- rhltable_insert_key(&trans->ctx.table->chains_ht,
- trans->ctx.chain->name,
- &trans->ctx.chain->rhlhead,
+ swap(trans->chain->name, trans->name);
+ rhltable_insert_key(&table->chains_ht,
+ trans->chain->name,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
}
- if (!nft_is_base_chain(trans->ctx.chain))
+ if (!nft_is_base_chain(trans->chain))
return;
nft_chain_stats_replace(trans);
- basechain = nft_base_chain(trans->ctx.chain);
+ basechain = nft_base_chain(trans->chain);
- switch (nft_trans_chain_policy(trans)) {
+ switch (trans->policy) {
case NF_DROP:
case NF_ACCEPT:
- basechain->policy = nft_trans_chain_policy(trans);
+ basechain->policy = trans->policy;
break;
}
}
-static void nft_obj_commit_update(struct nft_trans *trans)
+static void nft_obj_commit_update(const struct nft_ctx *ctx,
+ struct nft_trans *trans)
{
struct nft_object *newobj;
struct nft_object *obj;
@@ -7219,60 +10171,76 @@ static void nft_obj_commit_update(struct nft_trans *trans)
obj = nft_trans_obj(trans);
newobj = nft_trans_obj_newobj(trans);
- if (obj->ops->update)
- obj->ops->update(obj, newobj);
+ if (WARN_ON_ONCE(!obj->ops->update))
+ return;
- kfree(newobj);
+ obj->ops->update(obj, newobj);
+ nft_obj_destroy(ctx, newobj);
}
static void nft_commit_release(struct nft_trans *trans)
{
+ struct nft_ctx ctx = {
+ .net = trans->net,
+ };
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_DELTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYTABLE:
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
break;
case NFT_MSG_DELCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans))
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_DELRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ case NFT_MSG_DESTROYRULE:
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nf_tables_set_elem_destroy(&trans->ctx,
- nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv);
+ case NFT_MSG_DESTROYSETELEM:
+ nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans));
break;
case NFT_MSG_DELOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans))
- nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
else
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
if (trans->put_net)
- put_net(trans->ctx.net);
+ put_net(trans->net);
kfree(trans);
}
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
+ struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
struct nft_trans *trans, *next;
LIST_HEAD(head);
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_init(&nf_tables_destroy_list, &head);
+ list_splice_init(&nft_net->destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);
if (list_empty(&head))
@@ -7281,93 +10249,149 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
synchronize_rcu();
list_for_each_entry_safe(trans, next, &head, list) {
- list_del(&trans->list);
+ nft_trans_list_del(trans);
nft_commit_release(trans);
}
}
+void nf_tables_trans_destroy_flush_work(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ flush_work(&nft_net->destroy_work);
+}
+EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
+
+static bool nft_expr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ return false;
+}
+
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
+ const struct nft_expr *expr, *last;
+ struct nft_regs_track track = {};
+ unsigned int size, data_size;
+ void *data, *data_boundary;
+ struct nft_rule_dp *prule;
struct nft_rule *rule;
- unsigned int alloc = 0;
- int i;
/* already handled or inactive chain? */
- if (chain->rules_next || !nft_is_active_next(net, chain))
+ if (chain->blob_next || !nft_is_active_next(net, chain))
return 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
- i = 0;
-
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- alloc++;
+ data_size = 0;
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule)) {
+ data_size += sizeof(*prule) + rule->dlen;
+ if (data_size > INT_MAX)
+ return -ENOMEM;
+ }
}
- chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
- if (!chain->rules_next)
+ chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
+ if (!chain->blob_next)
return -ENOMEM;
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- chain->rules_next[i++] = rule;
+ data = (void *)chain->blob_next->data;
+ data_boundary = data + data_size;
+ size = 0;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(net, rule))
+ continue;
+
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ size = 0;
+ track.last = nft_expr_last(rule);
+ nft_rule_for_each_expr(expr, last, rule) {
+ track.cur = expr;
+
+ if (nft_expr_reduce(&track, expr)) {
+ expr = track.cur;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
+ return -ENOMEM;
+
+ memcpy(data + size, expr, expr->ops->size);
+ size += expr->ops->size;
+ }
+ if (WARN_ON_ONCE(size >= 1 << 12))
+ return -ENOMEM;
+
+ prule->handle = rule->handle;
+ prule->dlen = size;
+ prule->is_last = 0;
+
+ data += size;
+ size = 0;
+ chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- chain->rules_next[i] = NULL;
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ prule = (struct nft_rule_dp *)data;
+ nft_last_rule(chain, prule);
+
return 0;
}
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
- struct nft_chain *chain = trans->ctx.chain;
-
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- kvfree(chain->rules_next);
- chain->rules_next = NULL;
+ struct nft_chain *chain = nft_trans_rule_chain(trans);
+
+ kvfree(chain->blob_next);
+ chain->blob_next = NULL;
}
}
}
-static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+static void __nf_tables_commit_chain_free_rules(struct rcu_head *h)
{
- struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+ struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h);
- kvfree(o->start);
+ kvfree(l->blob);
}
-static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
- struct nft_rule **r = rules;
- struct nft_rules_old *old;
-
- while (*r)
- r++;
+ struct nft_rule_dp_last *last;
- r++; /* rcu_head is after end marker */
- old = (void *) r;
- old->start = rules;
+ /* last rule trailer is after end marker */
+ last = (void *)blob + sizeof(*blob) + blob->size;
+ last->blob = blob;
- call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+ call_rcu(&last->h, __nf_tables_commit_chain_free_rules);
}
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
- struct nft_rule **g0, **g1;
+ struct nft_rule_blob *g0, *g1;
bool next_genbit;
next_genbit = nft_gencursor_next(net);
- g0 = rcu_dereference_protected(chain->rules_gen_0,
+ g0 = rcu_dereference_protected(chain->blob_gen_0,
lockdep_commit_lock_is_held(net));
- g1 = rcu_dereference_protected(chain->rules_gen_1,
+ g1 = rcu_dereference_protected(chain->blob_gen_1,
lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
- if (chain->rules_next == NULL) {
+ if (chain->blob_next == NULL) {
/* chain had no change in last or next generation */
if (g0 == g1)
return;
@@ -7376,10 +10400,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
* one uses same rules as current generation.
*/
if (next_genbit) {
- rcu_assign_pointer(chain->rules_gen_1, g0);
+ rcu_assign_pointer(chain->blob_gen_1, g0);
nf_tables_commit_chain_free_rules_old(g1);
} else {
- rcu_assign_pointer(chain->rules_gen_0, g1);
+ rcu_assign_pointer(chain->blob_gen_0, g1);
nf_tables_commit_chain_free_rules_old(g0);
}
@@ -7387,11 +10411,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
}
if (next_genbit)
- rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
else
- rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);
- chain->rules_next = NULL;
+ chain->blob_next = NULL;
if (g0 == g1)
return;
@@ -7408,7 +10432,7 @@ static void nft_obj_del(struct nft_object *obj)
list_del_rcu(&obj->list);
}
-static void nft_chain_del(struct nft_chain *chain)
+void nft_chain_del(struct nft_chain *chain)
{
struct nft_table *table = chain->table;
@@ -7417,23 +10441,250 @@ static void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
-static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable,
- struct list_head *hook_list)
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+ struct nft_trans_gc *trans)
{
- struct nft_hook *hook, *next;
+ struct nft_elem_priv **priv = trans->priv;
+ unsigned int i;
- list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
- if (hook->inactive)
- list_move(&hook->list, hook_list);
+ for (i = 0; i < trans->count; i++) {
+ nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]);
+ nft_setelem_remove(ctx->net, trans->set, priv[i]);
}
}
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+ nft_set_put(trans->set);
+ put_net(trans->net);
+ kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+ struct nft_elem_priv *elem_priv;
+ struct nft_trans_gc *trans;
+ struct nft_ctx ctx = {};
+ unsigned int i;
+
+ trans = container_of(rcu, struct nft_trans_gc, rcu);
+ ctx.net = read_pnet(&trans->set->net);
+
+ for (i = 0; i < trans->count; i++) {
+ elem_priv = trans->priv[i];
+ if (!nft_setelem_is_catchall(trans->set, elem_priv))
+ atomic_dec(&trans->set->nelems);
+
+ nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
+ }
+
+ nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+ struct nftables_pernet *nft_net;
+ struct nft_ctx ctx = {};
+
+ nft_net = nft_pernet(trans->net);
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ /* Check for race with transaction, otherwise this batch refers to
+ * stale objects that might not be there anymore. Skip transaction if
+ * set has been destroyed from control plane transaction in case gc
+ * worker loses race.
+ */
+ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+ mutex_unlock(&nft_net->commit_mutex);
+ return false;
+ }
+
+ ctx.net = trans->net;
+ ctx.table = trans->set->table;
+
+ nft_trans_gc_setelem_remove(&ctx, trans);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+ struct nft_trans_gc *trans, *next;
+ LIST_HEAD(trans_gc_list);
+
+ spin_lock(&nf_tables_gc_list_lock);
+ list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+ list_del(&trans->list);
+ if (!nft_trans_gc_work_done(trans)) {
+ nft_trans_gc_destroy(trans);
+ continue;
+ }
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+ }
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct net *net = read_pnet(&set->net);
+ struct nft_trans_gc *trans;
+
+ trans = kzalloc(sizeof(*trans), gfp);
+ if (!trans)
+ return NULL;
+
+ trans->net = maybe_get_net(net);
+ if (!trans->net) {
+ kfree(trans);
+ return NULL;
+ }
+
+ refcount_inc(&set->refs);
+ trans->set = set;
+ trans->seq = gc_seq;
+
+ return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+ trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+ spin_lock(&nf_tables_gc_list_lock);
+ list_add_tail(&trans->list, &nf_tables_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct nft_set *set;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set;
+ nft_trans_gc_queue_work(gc);
+
+ return nft_trans_gc_alloc(set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+ struct nft_set *set;
+
+ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+ return NULL;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set;
+ call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+ return nft_trans_gc_alloc(set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ const struct nft_set *set = gc->set;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext))
+ continue;
+ if (nft_set_elem_is_dead(ext))
+ goto dead_elem;
+
+ nft_set_elem_dead(ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ return NULL;
+
+ nft_trans_gc_elem_add(gc, catchall->elem);
+ }
+
+ return gc;
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
+{
+ struct nft_set_elem_catchall *catchall, *next;
+ u64 tstamp = nft_net_tstamp(gc->net);
+ const struct nft_set *set = gc->set;
+ struct nft_elem_priv *elem_priv;
+ struct nft_set_ext *ext;
+
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!__nft_set_elem_expired(ext, tstamp))
+ continue;
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return NULL;
+
+ elem_priv = catchall->elem;
+ nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
+ nft_setelem_catchall_destroy(catchall);
+ nft_trans_gc_elem_add(gc, elem_priv);
+ }
+
+ return gc;
+}
+
static void nf_tables_module_autoload_cleanup(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_module_request *req, *next;
- WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
- list_for_each_entry_safe(req, next, &net->nft.module_list, list) {
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ list_for_each_entry_safe(req, next, &nft_net->module_list, list) {
WARN_ON_ONCE(!req->done);
list_del(&req->list);
kfree(req);
@@ -7442,6 +10693,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net)
static void nf_tables_commit_release(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;
/* all side effects have to be made visible.
@@ -7451,67 +10703,253 @@ static void nf_tables_commit_release(struct net *net)
* Memory reclaim happens asynchronously from work queue
* to prevent expensive synchronize_rcu() in commit phase.
*/
- if (list_empty(&net->nft.commit_list)) {
+ if (list_empty(&nft_net->commit_list)) {
nf_tables_module_autoload_cleanup(net);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return;
}
- trans = list_last_entry(&net->nft.commit_list,
+ trans = list_last_entry(&nft_net->commit_list,
struct nft_trans, list);
- get_net(trans->ctx.net);
+ get_net(trans->net);
WARN_ON_ONCE(trans->put_net);
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
nf_tables_module_autoload_cleanup(net);
- mutex_unlock(&net->nft.commit_mutex);
+ schedule_work(&nft_net->destroy_work);
+
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
+static void nft_commit_notify(struct net *net, u32 portid)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct sk_buff *batch_skb = NULL, *nskb, *skb;
+ unsigned char *data;
+ int len;
- schedule_work(&trans_destroy_work);
+ list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) {
+ if (!batch_skb) {
+new_batch:
+ batch_skb = skb;
+ len = NLMSG_GOODSIZE - skb->len;
+ list_del(&skb->list);
+ continue;
+ }
+ len -= skb->len;
+ if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
+ data = skb_put(batch_skb, skb->len);
+ memcpy(data, skb->data, skb->len);
+ list_del(&skb->list);
+ kfree_skb(skb);
+ continue;
+ }
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ goto new_batch;
+ }
+
+ if (batch_skb) {
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ }
+
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+}
+
+static int nf_tables_commit_audit_alloc(struct list_head *adl,
+ struct nft_table *table)
+{
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ return 0;
+ }
+ adp = kzalloc(sizeof(*adp), GFP_KERNEL);
+ if (!adp)
+ return -ENOMEM;
+ adp->table = table;
+ list_add(&adp->list, adl);
+ return 0;
+}
+
+static void nf_tables_commit_audit_free(struct list_head *adl)
+{
+ struct nft_audit_data *adp, *adn;
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ list_del(&adp->list);
+ kfree(adp);
+ }
+}
+
+/* nft audit emits the number of elements that get added/removed/updated,
+ * so NEW/DELSETELEM needs to increment based on the total elem count.
+ */
+static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM:
+ return nft_trans_container_elem(trans)->nelems;
+ }
+
+ return 1;
+}
+
+static void nf_tables_commit_audit_collect(struct list_head *adl,
+ const struct nft_trans *trans, u32 op)
+{
+ const struct nft_table *table = trans->table;
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ goto found;
+ }
+ WARN_ONCE(1, "table=%s not expected in commit list", table->name);
+ return;
+found:
+ adp->entries += nf_tables_commit_audit_entrycount(trans);
+ if (!adp->op || adp->op > op)
+ adp->op = op;
+}
+
+#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22)
+
+static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
+{
+ struct nft_audit_data *adp, *adn;
+ char aubuf[AUNFTABLENAMELEN];
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name,
+ generation);
+ audit_log_nfcfg(aubuf, adp->table->family, adp->entries,
+ nft2audit_op[adp->op], GFP_KERNEL);
+ list_del(&adp->list);
+ kfree(adp);
+ }
+}
+
+static void nft_set_commit_update(struct list_head *set_update_list)
+{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update);
+
+ if (!set->ops->commit || set->dead)
+ continue;
+
+ set->ops->commit(set);
+ }
+}
+
+static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net)
+{
+ unsigned int gc_seq;
+
+ /* Bump gc counter, it becomes odd, this is the busy mark. */
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
+ return gc_seq;
+}
+
+static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq)
+{
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
}
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ const struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ struct nft_trans_binding *trans_binding;
struct nft_trans *trans, *next;
+ unsigned int base_seq, gc_seq;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ struct nft_ctx ctx;
+ LIST_HEAD(adl);
int err;
- if (list_empty(&net->nft.commit_list)) {
- mutex_unlock(&net->nft.commit_mutex);
+ if (list_empty(&nft_net->commit_list)) {
+ mutex_unlock(&nft_net->commit_mutex);
return 0;
}
+ nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL);
+
+ list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) {
+ trans = &trans_binding->nft_trans;
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)) &&
+ !nft_trans_set_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound set\n");
+ return -EINVAL;
+ }
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)) &&
+ !nft_trans_chain_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound chain\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type);
+ break;
+ }
+ }
+
/* 0. Validate ruleset, otherwise roll back for error reporting. */
- if (nf_tables_validate(net) < 0)
+ if (nf_tables_validate(net) < 0) {
+ nft_net->validate_state = NFT_VALIDATE_DO;
return -EAGAIN;
+ }
err = nft_flow_rule_offload_commit(net);
if (err < 0)
return err;
/* 1. Allocate space for next generation rules_gen_X[] */
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ struct nft_table *table = trans->table;
int ret;
+ ret = nf_tables_commit_audit_alloc(&adl, table);
+ if (ret) {
+ nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
+ return ret;
+ }
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- chain = trans->ctx.chain;
+ chain = nft_trans_rule_chain(trans);
ret = nf_tables_commit_chain_prepare(net, chain);
if (ret < 0) {
nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
return ret;
}
}
}
/* step 2. Make rules_gen_X visible to packet path */
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
list_for_each_entry(chain, &table->chains, list)
nf_tables_commit_chain(net, chain);
}
@@ -7520,124 +10958,178 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- while (++net->nft.base_seq == 0);
+ base_seq = nft_base_seq(net);
+ while (++base_seq == 0)
+ ;
+
+ /* pairs with smp_load_acquire in nft_lookup_eval */
+ smp_store_release(&net->nft.base_seq, base_seq);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
+ nf_tables_commit_audit_collect(&adl, trans, trans->msg_type);
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
}
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ nf_tables_table_disable(net, table);
+
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
} else {
- nft_clear(net, trans->ctx.table);
+ nft_clear(net, table);
}
- nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELTABLE:
- list_del_rcu(&trans->ctx.table->list);
- nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ case NFT_MSG_DESTROYTABLE:
+ list_del_rcu(&table->list);
+ nf_tables_table_notify(&ctx, trans->msg_type);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- nft_chain_commit_update(trans);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_update(nft_trans_container_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
+ &nft_trans_chain_hooks(trans));
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
/* trans destroyed after rcu grace period */
} else {
- nft_chain_commit_drop_policy(trans);
- nft_clear(net, trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
+ nft_clear(net, nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
nft_trans_destroy(trans);
}
break;
case NFT_MSG_DELCHAIN:
- nft_chain_del(trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ &nft_trans_chain_hooks(trans));
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
+ } else {
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ NULL);
+ nf_tables_unregister_hook(ctx.net, ctx.table,
+ nft_trans_chain(trans));
+ }
break;
case NFT_MSG_NEWRULE:
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
+ nft_clear(net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
NFT_MSG_NEWRULE);
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_DELRULE:
+ case NFT_MSG_DESTROYRULE:
list_del_rcu(&nft_trans_rule(trans)->list);
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
- NFT_MSG_DELRULE);
- nft_rule_expr_deactivate(&trans->ctx,
- nft_trans_rule(trans),
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
+ trans->msg_type);
+ nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans),
NFT_TRANS_COMMIT);
+
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_clear(net, nft_trans_set(trans));
- /* This avoids hitting -EBUSY when deleting the table
- * from the transaction.
- */
- if (nft_set_is_anonymous(nft_trans_set(trans)) &&
- !list_empty(&nft_trans_set(trans)->bindings))
- trans->ctx.table->use--;
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
+ if (nft_trans_set_update(trans)) {
+ struct nft_set *set = nft_trans_set(trans);
+
+ WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
+ WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ if (nft_trans_set_size(trans))
+ WRITE_ONCE(set->size, nft_trans_set_size(trans));
+ } else {
+ nft_clear(net, nft_trans_set(trans));
+ /* This avoids hitting -EBUSY when deleting the table
+ * from the transaction.
+ */
+ if (nft_set_is_anonymous(nft_trans_set(trans)) &&
+ !list_empty(&nft_trans_set(trans)->bindings))
+ nft_use_dec(&table->use);
+ }
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
NFT_MSG_NEWSET, GFP_KERNEL);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSET:
+ case NFT_MSG_DESTROYSET:
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
- NFT_MSG_DELSET, GFP_KERNEL);
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
+ trans->msg_type, GFP_KERNEL);
break;
case NFT_MSG_NEWSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ te = nft_trans_container_elem(trans);
+
+ nft_trans_elems_add(&ctx, te);
- te->set->ops->activate(net, te->set, &te->elem);
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_NEWSETELEM, 0);
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_DELSETELEM, 0);
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
- te->set->ndeact--;
+ nft_trans_elems_remove(&ctx, te);
+
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- nft_obj_commit_update(trans);
- nf_tables_obj_notify(&trans->ctx,
+ nft_obj_commit_update(&ctx, trans);
+ nf_tables_obj_notify(&ctx,
nft_trans_obj(trans),
NFT_MSG_NEWOBJ);
} else {
nft_clear(net, nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx,
+ nf_tables_obj_notify(&ctx,
nft_trans_obj(trans),
NFT_MSG_NEWOBJ);
nft_trans_destroy(trans);
}
break;
case NFT_MSG_DELOBJ:
+ case NFT_MSG_DESTROYOBJ:
nft_obj_del(nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
- NFT_MSG_DELOBJ);
+ nf_tables_obj_notify(&ctx, nft_trans_obj(trans),
+ trans->msg_type);
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- nf_tables_flowtable_notify(&trans->ctx,
+ nft_trans_flowtable(trans)->data.flags =
+ nft_trans_flowtable_flags(trans);
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
NFT_MSG_NEWFLOWTABLE);
@@ -7645,37 +11137,45 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
&nft_trans_flowtable(trans)->hook_list);
} else {
nft_clear(net, nft_trans_flowtable(trans));
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
- &nft_trans_flowtable(trans)->hook_list,
+ NULL,
NFT_MSG_NEWFLOWTABLE);
}
nft_trans_destroy(trans);
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- nft_flowtable_hooks_del(nft_trans_flowtable(trans),
- &nft_trans_flowtable_hooks(trans));
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans),
- NFT_MSG_DELFLOWTABLE);
+ trans->msg_type);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans));
} else {
list_del_rcu(&nft_trans_flowtable(trans)->list);
- nf_tables_flowtable_notify(&trans->ctx,
+ nf_tables_flowtable_notify(&ctx,
nft_trans_flowtable(trans),
- &nft_trans_flowtable(trans)->hook_list,
- NFT_MSG_DELFLOWTABLE);
+ NULL,
+ trans->msg_type);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable(trans)->hook_list);
}
break;
}
}
+ nft_set_commit_update(&set_update_list);
+
+ nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+ nf_tables_commit_audit_log(&adl, nft_base_seq(net));
+
+ nft_gc_seq_end(nft_net, gc_seq);
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
nf_tables_commit_release(net);
return 0;
@@ -7683,44 +11183,51 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
static void nf_tables_module_autoload(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_module_request *req, *next;
LIST_HEAD(module_list);
- list_splice_init(&net->nft.module_list, &module_list);
- mutex_unlock(&net->nft.commit_mutex);
+ list_splice_init(&nft_net->module_list, &module_list);
+ mutex_unlock(&nft_net->commit_mutex);
list_for_each_entry_safe(req, next, &module_list, list) {
request_module("%s", req->module);
req->done = true;
}
- mutex_lock(&net->nft.commit_mutex);
- list_splice(&module_list, &net->nft.module_list);
+ mutex_lock(&nft_net->commit_mutex);
+ list_splice(&module_list, &nft_net->module_list);
}
static void nf_tables_abort_release(struct nft_trans *trans)
{
+ struct nft_ctx ctx = { };
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ if (nft_trans_chain_update(trans))
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_NEWRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(&trans->ctx, nft_trans_set(trans));
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
- nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv, true);
+ nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans));
break;
case NFT_MSG_NEWOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans))
- nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
else
nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
@@ -7728,73 +11235,145 @@ static void nf_tables_abort_release(struct nft_trans *trans)
kfree(trans);
}
-static int __nf_tables_abort(struct net *net, bool autoload)
+static void nft_set_abort_update(struct list_head *set_update_list)
{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update);
+
+ if (!set->ops->abort)
+ continue;
+
+ set->ops->abort(set);
+ }
+}
+
+static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
- struct nft_hook *hook;
+ struct nft_ctx ctx = {
+ .net = net,
+ };
+ int err = 0;
+
+ if (action == NFNL_ABORT_VALIDATE &&
+ nf_tables_validate(net) < 0)
+ err = -EAGAIN;
- list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
list) {
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
}
+ if (table->flags & __NFT_TABLE_F_WAS_DORMANT) {
+ nf_tables_table_disable(net, table);
+ table->flags |= NFT_TABLE_F_DORMANT;
+ } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
+ table->flags &= ~NFT_TABLE_F_DORMANT;
+ }
+ if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ table->nlpid = 0;
+ }
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
nft_trans_destroy(trans);
} else {
- list_del_rcu(&trans->ctx.table->list);
+ list_del_rcu(&table->list);
}
break;
case NFT_MSG_DELTABLE:
- nft_clear(trans->ctx.net, trans->ctx.table);
+ case NFT_MSG_DESTROYTABLE:
+ nft_clear(trans->net, table);
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
} else {
- trans->ctx.table->use--;
- nft_chain_del(trans->ctx.chain);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ if (nft_trans_chain_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&table->use);
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_unregister_hook(trans->net, table,
+ nft_trans_chain(trans));
}
break;
case NFT_MSG_DELCHAIN:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
+ } else {
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_chain(trans));
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWRULE:
- trans->ctx.chain->use--;
+ if (nft_trans_rule_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&nft_trans_rule_chain(trans)->use);
list_del_rcu(&nft_trans_rule(trans)->list);
- nft_rule_expr_deactivate(&trans->ctx,
+ nft_rule_expr_deactivate(&ctx,
nft_trans_rule(trans),
NFT_TRANS_ABORT);
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_DELRULE:
- trans->ctx.chain->use++;
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
+ case NFT_MSG_DESTROYRULE:
+ nft_use_inc_restore(&nft_trans_rule_chain(trans)->use);
+ nft_clear(trans->net, nft_trans_rule(trans));
+ nft_rule_expr_activate(&ctx, nft_trans_rule(trans));
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSET:
- trans->ctx.table->use--;
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
+ if (nft_trans_set_update(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&table->use);
if (nft_trans_set_bound(trans)) {
nft_trans_destroy(trans);
break;
}
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
break;
case NFT_MSG_DELSET:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_set(trans));
+ if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(&ctx, nft_trans_set(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSETELEM:
@@ -7802,96 +11381,125 @@ static int __nf_tables_abort(struct net *net, bool autoload)
nft_trans_destroy(trans);
break;
}
- te = (struct nft_trans_elem *)trans->data;
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
+ te = nft_trans_container_elem(trans);
+ if (!nft_trans_elems_new_abort(&ctx, te)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
- nft_set_elem_activate(net, te->set, &te->elem);
- te->set->ops->activate(net, te->set, &te->elem);
- te->set->ndeact--;
+ nft_trans_elems_destroy_abort(&ctx, te);
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWOBJ:
if (nft_trans_obj_update(trans)) {
- kfree(nft_trans_obj_newobj(trans));
+ nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans));
nft_trans_destroy(trans);
} else {
- trans->ctx.table->use--;
+ nft_use_dec_restore(&table->use);
nft_obj_del(nft_trans_obj(trans));
}
break;
case NFT_MSG_DELOBJ:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_obj(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable_hooks(trans));
} else {
- trans->ctx.table->use--;
+ nft_use_dec_restore(&table->use);
list_del_rcu(&nft_trans_flowtable(trans)->list);
nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
&nft_trans_flowtable(trans)->hook_list);
}
break;
case NFT_MSG_DELFLOWTABLE:
+ case NFT_MSG_DESTROYFLOWTABLE:
if (nft_trans_flowtable_update(trans)) {
- list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list)
- hook->inactive = false;
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
} else {
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_flowtable(trans));
}
nft_trans_destroy(trans);
break;
}
}
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ nft_set_abort_update(&set_update_list);
+
synchronize_rcu();
list_for_each_entry_safe_reverse(trans, next,
- &net->nft.commit_list, list) {
- list_del(&trans->list);
+ &nft_net->commit_list, list) {
+ nft_trans_list_del(trans);
nf_tables_abort_release(trans);
}
- if (autoload)
- nf_tables_module_autoload(net);
- else
- nf_tables_module_autoload_cleanup(net);
-
- return 0;
+ return err;
}
-static void nf_tables_cleanup(struct net *net)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb,
+ enum nfnl_abort_action action)
{
- nft_validate_state_update(net, NFT_VALIDATE_SKIP);
-}
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ unsigned int gc_seq;
+ int ret;
-static int nf_tables_abort(struct net *net, struct sk_buff *skb, bool autoload)
-{
- int ret = __nf_tables_abort(net, autoload);
+ gc_seq = nft_gc_seq_begin(nft_net);
+ ret = __nf_tables_abort(net, action);
+ nft_gc_seq_end(nft_net, gc_seq);
- mutex_unlock(&net->nft.commit_mutex);
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+
+ /* module autoload needs to happen after GC sequence update because it
+ * temporarily releases and grabs mutex again.
+ */
+ if (action == NFNL_ABORT_AUTOLOAD)
+ nf_tables_module_autoload(net);
+ else
+ nf_tables_module_autoload_cleanup(net);
+
+ mutex_unlock(&nft_net->commit_mutex);
return ret;
}
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
bool genid_ok;
- mutex_lock(&net->nft.commit_mutex);
+ mutex_lock(&nft_net->commit_mutex);
+ nft_net->tstamp = get_jiffies_64();
- genid_ok = genid == 0 || net->nft.base_seq == genid;
+ genid_ok = genid == 0 || nft_base_seq(net) == genid;
if (!genid_ok)
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
/* else, commit mutex has to be released by commit or abort function */
return genid_ok;
@@ -7904,7 +11512,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
- .cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
.owner = THIS_MODULE,
};
@@ -7941,106 +11548,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
-/*
- * Loop detection - walk through the ruleset beginning at the destination chain
- * of a new jump until either the source chain is reached (loop) or all
- * reachable chains have been traversed.
- *
- * The loop check is performed whenever a new jump verdict is added to an
- * expression or verdict map or a verdict map is bound to a new chain.
- */
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain);
-
-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- const struct nft_data *data;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- return nf_tables_check_loops(ctx, data->verdict.chain);
- default:
- return 0;
- }
-}
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain)
-{
- const struct nft_rule *rule;
- const struct nft_expr *expr, *last;
- struct nft_set *set;
- struct nft_set_binding *binding;
- struct nft_set_iter iter;
-
- if (ctx->chain == chain)
- return -ELOOP;
-
- list_for_each_entry(rule, &chain->rules, list) {
- nft_rule_for_each_expr(expr, last, rule) {
- struct nft_immediate_expr *priv;
- const struct nft_data *data;
- int err;
-
- if (strcmp(expr->ops->type->name, "immediate"))
- continue;
-
- priv = nft_expr_priv(expr);
- if (priv->dreg != NFT_REG_VERDICT)
- continue;
-
- data = &priv->data;
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- err = nf_tables_check_loops(ctx,
- data->verdict.chain);
- if (err < 0)
- return err;
- default:
- break;
- }
- }
- }
-
- list_for_each_entry(set, &ctx->table->sets, list) {
- if (!nft_is_active_next(ctx->net, set))
- continue;
- if (!(set->flags & NFT_SET_MAP) ||
- set->dtype != NFT_DATA_VERDICT)
- continue;
-
- list_for_each_entry(binding, &set->bindings, list) {
- if (!(binding->flags & NFT_SET_MAP) ||
- binding->chain != chain)
- continue;
-
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_loop_check_setelem;
-
- set->ops->walk(ctx, set, &iter);
- if (iter.err < 0)
- return iter.err;
- }
- }
-
- return 0;
-}
-
/**
* nft_parse_u32_check - fetch u32 attribute and check for maximum value
*
@@ -8066,28 +11573,24 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);
-/**
- * nft_parse_register - parse a register value from a netlink attribute
- *
- * @attr: netlink attribute
- *
- * Parse and translate a register value from a netlink attribute.
- * Registers used to be 128 bit wide, these register numbers will be
- * mapped to the corresponding 32 bit register numbers.
- */
-unsigned int nft_parse_register(const struct nlattr *attr)
+static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
unsigned int reg;
reg = ntohl(nla_get_be32(attr));
switch (reg) {
case NFT_REG_VERDICT...NFT_REG_4:
- return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ break;
+ case NFT_REG32_00...NFT_REG32_15:
+ *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ break;
default:
- return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ return -ERANGE;
}
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_parse_register);
/**
* nft_dump_register - dump a register value to a netlink attribute
@@ -8111,16 +11614,7 @@ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
}
EXPORT_SYMBOL_GPL(nft_dump_register);
-/**
- * nft_validate_register_load - validate a load from a register
- *
- * @reg: the register number
- * @len: the length of the data
- *
- * Validate that the input register is one of the general purpose
- * registers and that the length of the load is within the bounds.
- */
-int nft_validate_register_load(enum nft_registers reg, unsigned int len)
+static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
{
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
@@ -8131,26 +11625,56 @@ int nft_validate_register_load(enum nft_registers reg, unsigned int len)
return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_register_load);
-/**
- * nft_validate_register_store - validate an expressions' register store
- *
- * @ctx: context of the expression performing the load
- * @reg: the destination register number
- * @data: the data to load
- * @type: the data type
- * @len: the length of the data
- *
- * Validate that a data load uses the appropriate data type for
- * the destination register and the length is within the bounds.
- * A value of NULL for the data means that its runtime gathered
- * data.
- */
-int nft_validate_register_store(const struct nft_ctx *ctx,
- enum nft_registers reg,
- const struct nft_data *data,
- enum nft_data_types type, unsigned int len)
+int nft_parse_register_load(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *sreg, u32 len)
+{
+ int err, invalid_reg;
+ u32 reg, next_register;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_load(reg, len);
+ if (err < 0)
+ return err;
+
+ next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
+
+ /* Can't happen: nft_validate_register_load() should have failed */
+ if (WARN_ON_ONCE(next_register > NFT_REG32_NUM))
+ return -EINVAL;
+
+ /* find first register that did not see an earlier store. */
+ invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg);
+
+ /* invalid register within the range that we're loading from? */
+ if (invalid_reg < next_register)
+ return -ENODATA;
+
+ *sreg = reg;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_register_load);
+
+static void nft_saw_register_store(const struct nft_ctx *__ctx,
+ int reg, unsigned int len)
+{
+ unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ struct nft_ctx *ctx = (struct nft_ctx *)__ctx;
+
+ if (WARN_ON_ONCE(len == 0 || reg < 0))
+ return;
+
+ bitmap_set(ctx->reg_inited, reg, registers);
+}
+
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len)
{
int err;
@@ -8162,13 +11686,16 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
if (data != NULL &&
(data->verdict.code == NFT_GOTO ||
data->verdict.code == NFT_JUMP)) {
- err = nf_tables_check_loops(ctx, data->verdict.chain);
+ err = nft_chain_validate(ctx, data->verdict.chain);
if (err < 0)
return err;
}
- return 0;
+ break;
default:
+ if (type != NFT_DATA_VALUE)
+ return -EINVAL;
+
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
if (len == 0)
@@ -8177,17 +11704,39 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
sizeof_field(struct nft_regs, data))
return -ERANGE;
- if (data != NULL && type != NFT_DATA_VALUE)
- return -EINVAL;
- return 0;
+ break;
}
+
+ nft_saw_register_store(ctx, reg, len);
+ return 0;
+}
+
+int nft_parse_register_store(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *dreg,
+ const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ int err;
+ u32 reg;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_store(ctx, reg, data, type, len);
+ if (err < 0)
+ return err;
+
+ *dreg = reg;
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_register_store);
+EXPORT_SYMBOL_GPL(nft_parse_register_store);
static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
[NFTA_VERDICT_CODE] = { .type = NLA_U32 },
[NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 },
};
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
@@ -8205,50 +11754,68 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
if (!tb[NFTA_VERDICT_CODE])
return -EINVAL;
+
+ /* zero padding hole for memcmp */
+ memset(data, 0, sizeof(*data));
data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
switch (data->verdict.code) {
- default:
- switch (data->verdict.code & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_DROP:
- case NF_QUEUE:
- break;
- default:
- return -EINVAL;
- }
- /* fall through */
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ break;
case NFT_CONTINUE:
case NFT_BREAK:
case NFT_RETURN:
break;
case NFT_JUMP:
case NFT_GOTO:
- if (!tb[NFTA_VERDICT_CHAIN])
+ if (tb[NFTA_VERDICT_CHAIN]) {
+ chain = nft_chain_lookup(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN],
+ genmask);
+ } else if (tb[NFTA_VERDICT_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN_ID],
+ genmask);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ } else {
return -EINVAL;
- chain = nft_chain_lookup(ctx->net, ctx->table,
- tb[NFTA_VERDICT_CHAIN], genmask);
+ }
+
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
return -EOPNOTSUPP;
+ if (nft_chain_is_bound(chain))
+ return -EINVAL;
+ if (desc->flags & NFT_DATA_DESC_SETELEM &&
+ chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
- chain->use++;
data->verdict.chain = chain;
break;
+ default:
+ return -EINVAL;
}
desc->len = sizeof(data->verdict);
- desc->type = NFT_DATA_VERDICT;
+
return 0;
}
static void nft_verdict_uninit(const struct nft_data *data)
{
+ struct nft_chain *chain;
+
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use--;
+ chain = data->verdict.chain;
+ nft_use_dec(&chain->use);
break;
}
}
@@ -8279,20 +11846,25 @@ nla_put_failure:
}
static int nft_value_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
- struct nft_data_desc *desc, const struct nlattr *nla)
+ struct nft_data *data, struct nft_data_desc *desc,
+ const struct nlattr *nla)
{
unsigned int len;
len = nla_len(nla);
if (len == 0)
return -EINVAL;
- if (len > size)
+ if (len > desc->size)
return -EOVERFLOW;
+ if (desc->len) {
+ if (len != desc->len)
+ return -EINVAL;
+ } else {
+ desc->len = len;
+ }
nla_memcpy(data->data, nla, len);
- desc->type = NFT_DATA_VALUE;
- desc->len = len;
+
return 0;
}
@@ -8312,7 +11884,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
*
* @ctx: context of the expression using the data
* @data: destination struct nft_data
- * @size: maximum data length
* @desc: data description
* @nla: netlink attribute containing data
*
@@ -8322,24 +11893,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
* The caller can indicate that it only wants to accept data of type
* NFT_DATA_VALUE by passing NULL for the ctx argument.
*/
-int nft_data_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;
+ if (WARN_ON_ONCE(!desc->size))
+ return -EINVAL;
+
err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
nft_data_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_DATA_VALUE])
- return nft_value_init(ctx, data, size, desc,
- tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
+ if (tb[NFTA_DATA_VALUE]) {
+ if (desc->type != NFT_DATA_VALUE)
+ return -EINVAL;
+
+ err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
+ if (desc->type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(nft_data_init);
@@ -8392,31 +11974,35 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-int __nft_release_basechain(struct nft_ctx *ctx)
+static void __nft_release_hook(struct net *net, struct nft_table *table)
{
- struct nft_rule *rule, *nr;
+ struct nft_flowtable *flowtable;
+ struct nft_chain *chain;
- if (WARN_ON(!nft_is_base_chain(ctx->chain)))
- return 0;
+ list_for_each_entry(chain, &table->chains, list)
+ __nf_tables_unregister_hook(net, table, chain, true);
+ list_for_each_entry(flowtable, &table->flowtables, list)
+ __nft_unregister_flowtable_net_hooks(net, flowtable,
+ &flowtable->hook_list,
+ true);
+}
- nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
- list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
- list_del(&rule->list);
- ctx->chain->use--;
- nf_tables_rule_release(ctx, rule);
- }
- nft_chain_del(ctx->chain);
- ctx->table->use--;
- nf_tables_chain_destroy(ctx);
+static void __nft_release_hooks(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table;
- return 0;
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
+ __nft_release_hook(net, table);
+ }
}
-EXPORT_SYMBOL_GPL(__nft_release_basechain);
-static void __nft_release_tables(struct net *net)
+static void __nft_release_table(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable, *nf;
- struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
@@ -8426,123 +12012,249 @@ static void __nft_release_tables(struct net *net)
.family = NFPROTO_NETDEV,
};
- list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
- ctx.family = table->family;
+ ctx.family = table->family;
+ ctx.table = table;
+ list_for_each_entry(chain, &table->chains, list) {
+ if (nft_chain_binding(chain))
+ continue;
- list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hook(net, table, chain);
- /* No packets are walking on these chains anymore. */
- ctx.table = table;
- list_for_each_entry(chain, &table->chains, list) {
- ctx.chain = chain;
- list_for_each_entry_safe(rule, nr, &chain->rules, list) {
- list_del(&rule->list);
- chain->use--;
- nf_tables_rule_release(&ctx, rule);
- }
- }
- list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
- list_del(&flowtable->list);
- table->use--;
- nf_tables_flowtable_destroy(flowtable);
- }
- list_for_each_entry_safe(set, ns, &table->sets, list) {
- list_del(&set->list);
- table->use--;
- nft_set_destroy(&ctx, set);
- }
- list_for_each_entry_safe(obj, ne, &table->objects, list) {
- nft_obj_del(obj);
- table->use--;
- nft_obj_destroy(&ctx, obj);
- }
- list_for_each_entry_safe(chain, nc, &table->chains, list) {
- ctx.chain = chain;
- nft_chain_del(chain);
- table->use--;
- nf_tables_chain_destroy(&ctx);
+ ctx.chain = chain;
+ list_for_each_entry_safe(rule, nr, &chain->rules, list) {
+ list_del(&rule->list);
+ nft_use_dec(&chain->use);
+ nf_tables_rule_release(&ctx, rule);
}
+ }
+ list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
+ list_del(&flowtable->list);
+ nft_use_dec(&table->use);
+ nf_tables_flowtable_destroy(flowtable);
+ }
+ list_for_each_entry_safe(set, ns, &table->sets, list) {
+ list_del(&set->list);
+ nft_use_dec(&table->use);
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(&ctx, set);
+
+ nft_set_destroy(&ctx, set);
+ }
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ nft_obj_del(obj);
+ nft_use_dec(&table->use);
+ nft_obj_destroy(&ctx, obj);
+ }
+ list_for_each_entry_safe(chain, nc, &table->chains, list) {
+ nft_chain_del(chain);
+ nft_use_dec(&table->use);
+ nf_tables_chain_destroy(chain);
+ }
+ nf_tables_table_destroy(table);
+}
+
+static void __nft_release_tables(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table, *nt;
+
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
list_del(&table->list);
- nf_tables_table_destroy(&ctx);
+
+ __nft_release_table(net, table);
+ }
+}
+
+static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct nft_table *table, *to_delete[8];
+ struct nftables_pernet *nft_net;
+ struct netlink_notify *n = ptr;
+ struct net *net = n->net;
+ unsigned int deleted;
+ bool restart = false;
+ unsigned int gc_seq;
+
+ if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(net);
+ deleted = 0;
+ mutex_lock(&nft_net->commit_mutex);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ nf_tables_trans_destroy_flush_work(net);
+again:
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table) &&
+ n->portid == table->nlpid) {
+ if (table->flags & NFT_TABLE_F_PERSIST) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ continue;
+ }
+ __nft_release_hook(net, table);
+ list_del_rcu(&table->list);
+ to_delete[deleted++] = table;
+ if (deleted >= ARRAY_SIZE(to_delete))
+ break;
+ }
}
+ if (deleted) {
+ restart = deleted >= ARRAY_SIZE(to_delete);
+ synchronize_rcu();
+ while (deleted)
+ __nft_release_table(net, to_delete[--deleted]);
+
+ if (restart)
+ goto again;
+ }
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return NOTIFY_DONE;
}
+static struct notifier_block nft_nl_notifier = {
+ .notifier_call = nft_rcv_nl_event,
+};
+
static int __net_init nf_tables_init_net(struct net *net)
{
- INIT_LIST_HEAD(&net->nft.tables);
- INIT_LIST_HEAD(&net->nft.commit_list);
- INIT_LIST_HEAD(&net->nft.module_list);
- mutex_init(&net->nft.commit_mutex);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ INIT_LIST_HEAD(&nft_net->tables);
+ INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->destroy_list);
+ INIT_LIST_HEAD(&nft_net->commit_set_list);
+ INIT_LIST_HEAD(&nft_net->binding_list);
+ INIT_LIST_HEAD(&nft_net->module_list);
+ INIT_LIST_HEAD(&nft_net->notify_list);
+ mutex_init(&nft_net->commit_mutex);
net->nft.base_seq = 1;
- net->nft.validate_state = NFT_VALIDATE_SKIP;
+ nft_net->gc_seq = 0;
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
+ INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
return 0;
}
+static void __net_exit nf_tables_pre_exit_net(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ mutex_lock(&nft_net->commit_mutex);
+ __nft_release_hooks(net);
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
static void __net_exit nf_tables_exit_net(struct net *net)
{
- mutex_lock(&net->nft.commit_mutex);
- if (!list_empty(&net->nft.commit_list))
- __nf_tables_abort(net, false);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ unsigned int gc_seq;
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ if (!list_empty(&nft_net->module_list))
+ nf_tables_module_autoload_cleanup(net);
+
+ cancel_work_sync(&nft_net->destroy_work);
__nft_release_tables(net);
- mutex_unlock(&net->nft.commit_mutex);
- WARN_ON_ONCE(!list_empty(&net->nft.tables));
- WARN_ON_ONCE(!list_empty(&net->nft.module_list));
+
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->tables));
+ WARN_ON_ONCE(!list_empty(&nft_net->module_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
+}
+
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+ flush_work(&trans_gc_work);
}
static struct pernet_operations nf_tables_net_ops = {
- .init = nf_tables_init_net,
- .exit = nf_tables_exit_net,
+ .init = nf_tables_init_net,
+ .pre_exit = nf_tables_pre_exit_net,
+ .exit = nf_tables_exit_net,
+ .exit_batch = nf_tables_exit_batch,
+ .id = &nf_tables_net_id,
+ .size = sizeof(struct nftables_pernet),
};
static int __init nf_tables_module_init(void)
{
int err;
- spin_lock_init(&nf_tables_destroy_list_lock);
+ BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0);
+
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
err = nft_chain_filter_init();
if (err < 0)
- goto err1;
+ goto err_chain_filter;
err = nf_tables_core_module_init();
if (err < 0)
- goto err2;
+ goto err_core_module;
err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
if (err < 0)
- goto err3;
+ goto err_netdev_notifier;
err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params);
if (err < 0)
- goto err4;
+ goto err_rht_objname;
err = nft_offload_init();
if (err < 0)
- goto err5;
+ goto err_offload;
+
+ err = netlink_register_notifier(&nft_nl_notifier);
+ if (err < 0)
+ goto err_netlink_notifier;
/* must be last */
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
- goto err6;
+ goto err_nfnl_subsys;
nft_chain_route_init();
return err;
-err6:
+
+err_nfnl_subsys:
+ netlink_unregister_notifier(&nft_nl_notifier);
+err_netlink_notifier:
nft_offload_exit();
-err5:
+err_offload:
rhltable_destroy(&nft_objname_ht);
-err4:
+err_rht_objname:
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
-err3:
+err_netdev_notifier:
nf_tables_core_module_exit();
-err2:
+err_core_module:
nft_chain_filter_fini();
-err1:
+err_chain_filter:
unregister_pernet_subsys(&nf_tables_net_ops);
return err;
}
@@ -8550,12 +12262,13 @@ err1:
static void __exit nf_tables_module_exit(void)
{
nfnetlink_subsys_unregister(&nf_tables_subsys);
+ netlink_unregister_notifier(&nft_nl_notifier);
nft_offload_exit();
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
- cancel_work_sync(&trans_destroy_work);
+ cancel_work_sync(&trans_gc_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
nf_tables_core_module_exit();
@@ -8566,4 +12279,5 @@ module_exit(nf_tables_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Framework for packet filtering and classification");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 96c74c4c7176..6557a4018c09 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,43 +21,126 @@
#include <net/netfilter/nf_log.h>
#include <net/netfilter/nft_meta.h>
-static noinline void __nft_trace_packet(struct nft_traceinfo *info,
- const struct nft_chain *chain,
- enum nft_trace_types type)
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static struct static_key_false nf_tables_skip_direct_calls;
+
+static inline bool nf_skip_indirect_calls(void)
{
- const struct nft_pktinfo *pkt = info->pkt;
+ return static_branch_likely(&nf_tables_skip_direct_calls);
+}
- if (!info->trace || !pkt->skb->nf_trace)
+static inline void __init nf_skip_indirect_calls_enable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+ static_branch_enable(&nf_tables_skip_direct_calls);
+}
+#else
+static inline void nf_skip_indirect_calls_enable(void) { }
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+
+static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info,
+ enum nft_trace_types type)
+{
+ if (!info->trace || !info->nf_trace)
return;
- info->chain = chain;
info->type = type;
- nft_trace_notify(info);
+ nft_trace_notify(pkt, verdict, rule, info);
}
-static inline void nft_trace_packet(struct nft_traceinfo *info,
- const struct nft_chain *chain,
- const struct nft_rule *rule,
+static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ struct nft_verdict *verdict,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
- info->rule = rule;
- __nft_trace_packet(info, chain, type);
+ info->nf_trace = pkt->skb->nf_trace;
+ __nft_trace_packet(pkt, verdict, rule, info, type);
}
}
+static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info)
+{
+ if (static_branch_unlikely(&nft_trace_enabled))
+ info->nf_trace = pkt->skb->nf_trace;
+}
+
+static void nft_bitwise_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+
+ *dst = (*src & priv->mask) ^ priv->xor;
+}
+
static void nft_cmp_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- u32 mask = nft_cmp_fast_mask(priv->len);
- if ((regs->data[priv->sreg] & mask) == priv->data)
+ if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv)
return;
regs->verdict.code = NFT_BREAK;
}
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+ const u64 *mask = (const u64 *)&priv->mask;
+ const u64 *data = (const u64 *)&priv->data;
+
+ if (((reg_data[0] & mask[0]) == data[0] &&
+ ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
+static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ enum nft_trace_types type;
+
+ switch (regs->verdict.code & NF_VERDICT_MASK) {
+ case NFT_CONTINUE:
+ case NFT_RETURN:
+ type = NFT_TRACETYPE_RETURN;
+ break;
+ case NF_STOLEN:
+ type = NFT_TRACETYPE_RULE;
+ /* can't access skb->nf_trace; use copy */
+ break;
+ default:
+ type = NFT_TRACETYPE_RULE;
+
+ if (info->trace)
+ info->nf_trace = pkt->skb->nf_trace;
+ break;
+ }
+
+ __nft_trace_packet(pkt, &regs->verdict, rule, info, type);
+}
+
+static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ if (static_branch_unlikely(&nft_trace_enabled))
+ __nft_trace_verdict(pkt, info, rule, regs);
+}
+
static bool nft_payload_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -70,9 +153,9 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb);
else {
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
return false;
- ptr = skb_network_header(skb) + pkt->xt.thoff;
+ ptr = skb->data + nft_thoff(pkt);
}
ptr += priv->offset;
@@ -101,7 +184,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
base_chain = nft_base_chain(chain);
- rcu_read_lock();
pstats = READ_ONCE(base_chain->stats);
if (pstats) {
local_bh_disable();
@@ -112,70 +194,92 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
u64_stats_update_end(&stats->syncp);
local_bh_enable();
}
- rcu_read_unlock();
}
struct nft_jumpstack {
- const struct nft_chain *chain;
- struct nft_rule *const *rules;
+ const struct nft_rule_dp *rule;
};
static void expr_call_ops_eval(const struct nft_expr *expr,
struct nft_regs *regs,
struct nft_pktinfo *pkt)
{
-#ifdef CONFIG_RETPOLINE
- unsigned long e = (unsigned long)expr->ops->eval;
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ unsigned long e;
+
+ if (nf_skip_indirect_calls())
+ goto indirect_call;
+
+ e = (unsigned long)expr->ops->eval;
#define X(e, fun) \
do { if ((e) == (unsigned long)(fun)) \
return fun(expr, regs, pkt); } while (0)
X(e, nft_payload_eval);
X(e, nft_cmp_eval);
+ X(e, nft_counter_eval);
X(e, nft_meta_get_eval);
X(e, nft_lookup_eval);
+#if IS_ENABLED(CONFIG_NFT_CT)
+ X(e, nft_ct_get_fast_eval);
+#endif
X(e, nft_range_eval);
X(e, nft_immediate_eval);
X(e, nft_byteorder_eval);
X(e, nft_dynset_eval);
X(e, nft_rt_get_eval);
X(e, nft_bitwise_eval);
+ X(e, nft_objref_eval);
+ X(e, nft_objref_map_eval);
#undef X
-#endif /* CONFIG_RETPOLINE */
+indirect_call:
+#endif /* CONFIG_MITIGATION_RETPOLINE */
expr->ops->eval(expr, regs, pkt);
}
+#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0]
+#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size
+#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen]
+
+#define nft_rule_dp_for_each_expr(expr, last, rule) \
+ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
+ (expr) != (last); \
+ (expr) = nft_rule_expr_next(expr))
+
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct net *net = nft_net(pkt);
- struct nft_rule *const *rules;
- const struct nft_rule *rule;
const struct nft_expr *expr, *last;
+ const struct nft_rule_dp *rule;
struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
+ struct nft_rule_blob *blob;
struct nft_traceinfo info;
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
- nft_trace_init(&info, pkt, &regs.verdict, basechain);
+ nft_trace_init(&info, pkt, basechain);
do_chain:
if (genbit)
- rules = rcu_dereference(chain->rules_gen_1);
+ blob = rcu_dereference(chain->blob_gen_1);
else
- rules = rcu_dereference(chain->rules_gen_0);
+ blob = rcu_dereference(chain->blob_gen_0);
+ rule = (struct nft_rule_dp *)blob->data;
next_rule:
- rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- for (; *rules ; rules++) {
- rule = *rules;
- nft_rule_for_each_expr(expr, last, rule) {
+ for (; !rule->is_last ; rule = nft_rule_next(rule)) {
+ nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_cmp16_fast_ops)
+ nft_cmp16_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_bitwise_fast_ops)
+ nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
expr_call_ops_eval(expr, &regs, pkt);
@@ -187,60 +291,58 @@ next_rule:
switch (regs.verdict.code) {
case NFT_BREAK:
regs.verdict.code = NFT_CONTINUE;
+ nft_trace_copy_nftrace(pkt, &info);
continue;
case NFT_CONTINUE:
- nft_trace_packet(&info, chain, rule,
+ nft_trace_packet(pkt, &regs.verdict, &info, rule,
NFT_TRACETYPE_RULE);
continue;
}
break;
}
+ nft_trace_verdict(pkt, &info, rule, &regs);
+
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
- case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
return regs.verdict.code;
+ case NF_DROP:
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
}
switch (regs.verdict.code) {
case NFT_JUMP:
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
- jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rules = rules + 1;
+ jumpstack[stackptr].rule = nft_rule_next(rule);
stackptr++;
- /* fall through */
+ fallthrough;
case NFT_GOTO:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
-
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
case NFT_RETURN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RETURN);
break;
default:
- WARN_ON(1);
+ WARN_ON_ONCE(1);
}
if (stackptr > 0) {
stackptr--;
- chain = jumpstack[stackptr].chain;
- rules = jumpstack[stackptr].rules;
+ rule = jumpstack[stackptr].rule;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
+ nft_trace_packet(pkt, &regs.verdict, &info, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
+ if (nft_base_chain(basechain)->policy == NF_DROP)
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
+
return nft_base_chain(basechain)->policy;
}
EXPORT_SYMBOL_GPL(nft_do_chain);
@@ -257,18 +359,25 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_meta_type,
&nft_rt_type,
&nft_exthdr_type,
+ &nft_last_type,
+ &nft_counter_type,
+ &nft_objref_type,
+ &nft_inner_type,
};
static struct nft_object_type *nft_basic_objects[] = {
#ifdef CONFIG_NETWORK_SECMARK
&nft_secmark_obj_type,
#endif
+ &nft_counter_obj_type,
};
int __init nf_tables_core_module_init(void)
{
int err, i, j = 0;
+ nft_counter_init_seqcount();
+
for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
err = nft_register_obj(nft_basic_objects[i]);
if (err)
@@ -281,6 +390,8 @@ int __init nf_tables_core_module_init(void)
goto err;
}
+ nf_skip_indirect_calls_enable();
+
return 0;
err:
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 185fc82c99aa..fd30e205de84 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -28,6 +28,63 @@ static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
return flow;
}
+void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
+ enum flow_dissector_key_id addr_type)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_flow_key *mask = &match->mask;
+ struct nft_flow_key *key = &match->key;
+
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))
+ return;
+
+ key->control.addr_type = addr_type;
+ mask->control.addr_type = 0xffff;
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
+ offsetof(struct nft_flow_key, control);
+}
+
+struct nft_offload_ethertype {
+ __be16 value;
+ __be16 mask;
+};
+
+static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_offload_ethertype ethertype = {
+ .value = match->key.basic.n_proto,
+ .mask = match->mask.basic.n_proto,
+ };
+
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) &&
+ (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) ||
+ match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.cvlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid;
+ match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid;
+ match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] =
+ offsetof(struct nft_flow_key, cvlan);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN);
+ } else if (match->dissector.used_keys &
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) &&
+ (match->key.basic.n_proto == htons(ETH_P_8021Q) ||
+ match->key.basic.n_proto == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.vlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
+ offsetof(struct nft_flow_key, vlan);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
+ }
+}
+
struct nft_flow_rule *nft_flow_rule_create(struct net *net,
const struct nft_rule *rule)
{
@@ -37,8 +94,9 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr->ops && expr != nft_expr_last(rule)) {
- if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION)
+ while (nft_expr_more(rule, expr)) {
+ if (expr->ops->offload_action &&
+ expr->ops->offload_action(expr))
num_actions++;
expr = nft_expr_next(expr);
@@ -61,7 +119,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
ctx->net = net;
ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
- while (expr->ops && expr != nft_expr_last(rule)) {
+ while (nft_expr_more(rule, expr)) {
if (!expr->ops->offload) {
err = -EOPNOTSUPP;
goto err_out;
@@ -72,6 +130,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net,
expr = nft_expr_next(expr);
}
+ nft_flow_rule_transfer_vlan(ctx, flow);
+
flow->proto = ctx->dep.l3num;
kfree(ctx);
@@ -149,7 +209,7 @@ static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
return 0;
}
-int nft_chain_offload_priority(struct nft_base_chain *basechain)
+static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
{
if (basechain->ops.priority <= 0 ||
basechain->ops.priority > USHRT_MAX)
@@ -158,6 +218,31 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain)
return 0;
}
+bool nft_chain_offload_support(const struct nft_base_chain *basechain)
+{
+ struct nf_hook_ops *ops;
+ struct net_device *dev;
+ struct nft_hook *hook;
+
+ if (nft_chain_offload_priority(basechain) < 0)
+ return false;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->pf != NFPROTO_NETDEV ||
+ ops->hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = ops->dev;
+ if (!dev->netdev_ops->ndo_setup_tc &&
+ !flow_indr_dev_exists())
+ return false;
+ }
+ }
+
+ return true;
+}
+
static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
const struct nft_base_chain *basechain,
const struct nft_rule *rule,
@@ -180,26 +265,56 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
cls_flow->rule = flow->rule;
}
-static int nft_flow_offload_rule(struct nft_chain *chain,
- struct nft_rule *rule,
- struct nft_flow_rule *flow,
- enum flow_cls_command command)
+static int nft_flow_offload_cmd(const struct nft_chain *chain,
+ const struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command,
+ struct flow_cls_offload *cls_flow)
{
struct netlink_ext_ack extack = {};
- struct flow_cls_offload cls_flow;
struct nft_base_chain *basechain;
if (!nft_is_base_chain(chain))
return -EOPNOTSUPP;
basechain = nft_base_chain(chain);
- nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack,
+ nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack,
command);
- return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow,
+ return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow,
&basechain->flow_block.cb_list);
}
+static int nft_flow_offload_rule(const struct nft_chain *chain,
+ struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command)
+{
+ struct flow_cls_offload cls_flow;
+
+ return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow);
+}
+
+int nft_flow_rule_stats(const struct nft_chain *chain,
+ const struct nft_rule *rule)
+{
+ struct flow_cls_offload cls_flow = {};
+ struct nft_expr *expr, *next;
+ int err;
+
+ err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS,
+ &cls_flow);
+ if (err < 0)
+ return err;
+
+ nft_rule_for_each_expr(expr, next, rule) {
+ if (expr->ops->offload_stats)
+ expr->ops->offload_stats(expr, &cls_flow.stats);
+ }
+
+ return 0;
+}
+
static int nft_flow_offload_bind(struct flow_block_offload *bo,
struct nft_base_chain *basechain)
{
@@ -265,6 +380,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo,
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
+ bo->cb_list_head = &basechain->flow_block.cb_list;
INIT_LIST_HEAD(&bo->cb_list);
}
@@ -290,15 +406,18 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb)
struct nft_base_chain *basechain = block_cb->indr.data;
struct net_device *dev = block_cb->indr.dev;
struct netlink_ext_ack extack = {};
+ struct nftables_pernet *nft_net;
struct net *net = dev_net(dev);
struct flow_block_offload bo;
nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND,
basechain, &extack);
- mutex_lock(&net->nft.commit_mutex);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ list_del(&block_cb->driver_list);
list_move(&block_cb->list, &bo.cb_list);
nft_flow_offload_unbind(&bo, basechain);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
}
static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
@@ -311,7 +430,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack);
- err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo,
+ err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo,
nft_indr_block_cleanup);
if (err < 0)
return err;
@@ -322,8 +441,6 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
return nft_block_setup(basechain, &bo, cmd);
}
-#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
-
static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
struct net_device *dev,
enum flow_block_command cmd)
@@ -342,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain,
const struct net_device *this_dev,
enum flow_block_command cmd)
{
- struct net_device *dev;
+ struct nf_hook_ops *ops;
struct nft_hook *hook;
int err, i = 0;
list_for_each_entry(hook, &basechain->hook_list, list) {
- dev = hook->ops.dev;
- if (this_dev && this_dev != dev)
- continue;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (this_dev && this_dev != ops->dev)
+ continue;
- err = nft_chain_offload_cmd(basechain, dev, cmd);
- if (err < 0 && cmd == FLOW_BLOCK_BIND) {
- if (!this_dev)
- goto err_flow_block;
+ err = nft_chain_offload_cmd(basechain, ops->dev, cmd);
+ if (err < 0 && cmd == FLOW_BLOCK_BIND) {
+ if (!this_dev)
+ goto err_flow_block;
- return err;
+ return err;
+ }
+ i++;
}
- i++;
}
return 0;
err_flow_block:
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (i-- <= 0)
- break;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
- dev = hook->ops.dev;
- nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
+ nft_chain_offload_cmd(basechain, ops->dev,
+ FLOW_BLOCK_UNBIND);
+ }
}
return err;
}
@@ -396,41 +516,42 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
static void nft_flow_rule_offload_abort(struct net *net,
struct nft_trans *trans)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
int err = 0;
- list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
+ list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) {
+ if (trans->table->family != NFPROTO_NETDEV)
continue;
switch (trans->msg_type) {
case NFT_MSG_NEWCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
nft_trans_chain_update(trans))
continue;
- err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
FLOW_BLOCK_UNBIND);
break;
case NFT_MSG_DELCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_chain(trans->ctx.chain, NULL,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
FLOW_BLOCK_BIND);
break;
case NFT_MSG_NEWRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
NULL, FLOW_CLS_DESTROY);
break;
case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
nft_trans_flow_rule(trans),
FLOW_CLS_REPLACE);
@@ -444,51 +565,52 @@ static void nft_flow_rule_offload_abort(struct net *net,
int nft_flow_rule_offload_commit(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;
int err = 0;
u8 policy;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->table->family != NFPROTO_NETDEV)
continue;
switch (trans->msg_type) {
case NFT_MSG_NEWCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
nft_trans_chain_update(trans))
continue;
policy = nft_trans_chain_policy(trans);
- err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
FLOW_BLOCK_BIND);
break;
case NFT_MSG_DELCHAIN:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
policy = nft_trans_chain_policy(trans);
- err = nft_flow_offload_chain(trans->ctx.chain, &policy,
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
FLOW_BLOCK_UNBIND);
break;
case NFT_MSG_NEWRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- if (trans->ctx.flags & NLM_F_REPLACE ||
- !(trans->ctx.flags & NLM_F_APPEND)) {
+ if (trans->flags & NLM_F_REPLACE ||
+ !(trans->flags & NLM_F_APPEND)) {
err = -EOPNOTSUPP;
break;
}
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
nft_trans_flow_rule(trans),
FLOW_CLS_REPLACE);
break;
case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
continue;
- err = nft_flow_offload_rule(trans->ctx.chain,
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
nft_trans_rule(trans),
NULL, FLOW_CLS_DESTROY);
break;
@@ -500,35 +622,18 @@ int nft_flow_rule_offload_commit(struct net *net)
}
}
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- if (trans->ctx.family != NFPROTO_NETDEV)
- continue;
-
- switch (trans->msg_type) {
- case NFT_MSG_NEWRULE:
- case NFT_MSG_DELRULE:
- if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD))
- continue;
-
- nft_flow_rule_destroy(nft_trans_flow_rule(trans));
- break;
- default:
- break;
- }
- }
-
return err;
}
-static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
+static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net,
+ struct net_device *dev)
{
struct nft_base_chain *basechain;
- struct net *net = dev_net(dev);
struct nft_hook *hook, *found;
const struct nft_table *table;
struct nft_chain *chain;
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV)
continue;
@@ -540,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
found = NULL;
basechain = nft_base_chain(chain);
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev != dev)
+ if (!nft_hook_find_ops(hook, dev))
continue;
found = hook;
@@ -560,19 +665,21 @@ static int nft_offload_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
struct net *net = dev_net(dev);
struct nft_chain *chain;
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- mutex_lock(&net->nft.commit_mutex);
- chain = __nft_offload_get_chain(dev);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ chain = __nft_offload_get_chain(nft_net, dev);
if (chain)
nft_flow_block_chain(nft_base_chain(chain), dev,
FLOW_BLOCK_UNBIND);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 87b36da5cd98..a88abae5a9de 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -7,7 +7,7 @@
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/hash.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/skbuff.h>
@@ -15,6 +15,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@@ -25,22 +26,6 @@
DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
EXPORT_SYMBOL_GPL(nft_trace_enabled);
-static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
-{
- __be32 id;
-
- /* using skb address as ID results in a limited number of
- * values (and quick reuse).
- *
- * So we attempt to use as many skb members that will not
- * change while skb is with netfilter.
- */
- id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
- skb->skb_iif);
-
- return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
-}
-
static int trace_fill_header(struct sk_buff *nlskb, u16 type,
const struct sk_buff *skb,
int off, unsigned int len)
@@ -106,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
return 0;
}
+static int nf_trace_fill_ct_info(struct sk_buff *nlskb,
+ const struct sk_buff *skb)
+{
+ const struct nf_ct_hook *ct_hook;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ u32 state;
+
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook)
+ return 0;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) {
+ if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */
+ return 0;
+
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ } else {
+ state = NF_CT_STATE_BIT(ctinfo);
+ }
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state)))
+ return -1;
+
+ if (ct) {
+ u32 id = ct_hook->get_id(&ct->ct_general);
+ u32 status = READ_ONCE(ct->status);
+ u8 dir = CTINFO2DIR(ctinfo);
+
+ if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir))
+ return -1;
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id))
+ return -1;
+
+ /* Kernel implementation detail, withhold this from userspace for now */
+ status &= ~IPS_NAT_CLASH;
+
+ if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status)))
+ return -1;
+ }
+
+ return 0;
+}
+
static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
@@ -113,17 +144,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
int off = skb_network_offset(skb);
unsigned int len, nh_end;
- nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+ nh_end = pkt->flags & NFT_PKTINFO_L4PROTO ? nft_thoff(pkt) : skb->len;
len = min_t(unsigned int, nh_end - skb_network_offset(skb),
NFT_TRACETYPE_NETWORK_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
return -1;
- if (pkt->tprot_set) {
- len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ if (pkt->flags & NFT_PKTINFO_L4PROTO) {
+ len = min_t(unsigned int, skb->len - nft_thoff(pkt),
NFT_TRACETYPE_TRANSPORT_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
- pkt->xt.thoff, len))
+ nft_thoff(pkt), len))
return -1;
}
@@ -140,9 +171,11 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
}
static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
const struct nft_traceinfo *info)
{
- if (!info->rule)
+ if (!rule || rule->is_last)
return 0;
/* a continue verdict with ->type == RETURN means that this is
@@ -151,15 +184,16 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
* Since no rule matched, the ->rule pointer is invalid.
*/
if (info->type == NFT_TRACETYPE_RETURN &&
- info->verdict->code == NFT_CONTINUE)
+ verdict->code == NFT_CONTINUE)
return 0;
return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
- cpu_to_be64(info->rule->handle),
+ cpu_to_be64(rule->handle),
NFTA_TRACE_PAD);
}
-static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
+ struct nft_traceinfo *info)
{
switch (info->type) {
case NFT_TRACETYPE_RETURN:
@@ -169,7 +203,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return false;
}
- switch (info->verdict->code) {
+ switch (verdict->code) {
case NFT_JUMP:
case NFT_GOTO:
break;
@@ -180,26 +214,54 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return true;
}
-void nft_trace_notify(struct nft_traceinfo *info)
+static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule,
+ const struct nft_traceinfo *info)
{
- const struct nft_pktinfo *pkt = info->pkt;
- struct nfgenmsg *nfmsg;
+ const struct nft_rule_dp_last *last;
+
+ if (!rule)
+ return &info->basechain->chain;
+
+ while (!rule->is_last)
+ rule = nft_rule_next(rule);
+
+ last = (const struct nft_rule_dp_last *)rule;
+
+ if (WARN_ON_ONCE(!last->chain))
+ return &info->basechain->chain;
+
+ return last->chain;
+}
+
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info)
+{
+ const struct nft_chain *chain;
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
+ u32 mark = 0;
u16 event;
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
return;
+ chain = nft_trace_get_chain(rule, info);
+
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
- nla_total_size(strlen(info->chain->table->name)) +
- nla_total_size(strlen(info->chain->name)) +
+ nla_total_size(strlen(chain->table->name)) +
+ nla_total_size(strlen(chain->name)) +
nla_total_size_64bit(sizeof(__be64)) + /* rule handle */
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
- nla_total_size(sizeof(u32)) + /* id */
+ nla_total_size(sizeof(u32)) + /* ct id */
+ nla_total_size(sizeof(u8)) + /* ct direction */
+ nla_total_size(sizeof(u32)) + /* ct state */
+ nla_total_size(sizeof(u32)) + /* ct status */
+ nla_total_size(sizeof(u32)) + /* trace id */
nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
@@ -211,39 +273,35 @@ void nft_trace_notify(struct nft_traceinfo *info)
nla_total_size(sizeof(u32)) + /* nfproto */
nla_total_size(sizeof(u32)); /* policy */
- if (nft_trace_have_verdict_chain(info))
- size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+ if (nft_trace_have_verdict_chain(verdict, info))
+ size += nla_total_size(strlen(verdict->chain->name)); /* jump target */
skb = nlmsg_new(size, GFP_ATOMIC);
if (!skb)
return;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE);
- nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family,
+ NFNETLINK_V0, 0);
if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = info->basechain->type->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
goto nla_put_failure;
- if (trace_fill_id(skb, pkt->skb))
+ if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
+ if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name))
+ if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
goto nla_put_failure;
- if (nf_trace_fill_rule_info(skb, info))
+ if (nf_trace_fill_rule_info(skb, verdict, rule, info))
goto nla_put_failure;
switch (info->type) {
@@ -251,19 +309,31 @@ void nft_trace_notify(struct nft_traceinfo *info)
case __NFT_TRACETYPE_MAX:
break;
case NFT_TRACETYPE_RETURN:
- case NFT_TRACETYPE_RULE:
- if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+ case NFT_TRACETYPE_RULE: {
+ unsigned int v;
+
+ if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict))
goto nla_put_failure;
+
+ /* pkt->skb undefined iff NF_STOLEN, disable dump */
+ v = verdict->code & NF_VERDICT_MASK;
+ if (v == NF_STOLEN)
+ info->packet_dumped = true;
+ else
+ mark = pkt->skb->mark;
+
break;
+ }
case NFT_TRACETYPE_POLICY:
+ mark = pkt->skb->mark;
+
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
- if (pkt->skb->mark &&
- nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark)))
goto nla_put_failure;
if (!info->packet_dumped) {
@@ -272,6 +342,10 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nf_trace_fill_pkt_info(skb, pkt))
goto nla_put_failure;
+
+ if (nf_trace_fill_ct_info(skb, pkt->skb))
+ goto nla_put_failure;
+
info->packet_dumped = true;
}
@@ -285,12 +359,20 @@ void nft_trace_notify(struct nft_traceinfo *info)
}
void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
- const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
+ static siphash_key_t trace_key __read_mostly;
+ struct sk_buff *skb = pkt->skb;
+
info->basechain = nft_base_chain(chain);
info->trace = true;
+ info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
- info->pkt = pkt;
- info->verdict = verdict;
+
+ net_get_random_once(&trace_key, sizeof(trace_key));
+
+ info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
+ skb_get_hash_net(nft_net(pkt), skb),
+ skb->skb_iif,
+ &trace_key);
}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 99127e2d95a8..811d02b4c4f7 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -28,11 +28,14 @@
#include <linux/sched/signal.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
+#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+MODULE_DESCRIPTION("Netfilter messages via netlink socket");
#define nfnl_dereference_protected(id) \
rcu_dereference_protected(table[(id)].subsys, \
@@ -40,11 +43,39 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
#define NFNL_MAX_ATTR_COUNT 32
+static unsigned int nfnetlink_pernet_id __read_mostly;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static DEFINE_SPINLOCK(nfnl_grp_active_lock);
+#endif
+
+struct nfnl_net {
+ struct sock *nfnl;
+};
+
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
} table[NFNL_SUBSYS_COUNT];
+static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT];
+
+static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = {
+ [NFNL_SUBSYS_NONE] = "nfnl_subsys_none",
+ [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink",
+ [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp",
+ [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue",
+ [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog",
+ [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf",
+ [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset",
+ [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct",
+ [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout",
+ [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper",
+ [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables",
+ [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat",
+ [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook",
+};
+
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
@@ -57,6 +88,11 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
};
+static struct nfnl_net *nfnl_pernet(struct net *net)
+{
+ return net_generic(net, nfnetlink_pernet_id);
+}
+
void nfnl_lock(__u8 subsys_id)
{
mutex_lock(&table[subsys_id].mutex);
@@ -131,30 +167,51 @@ nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
- return netlink_has_listeners(net->nfnl, group);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_has_listeners(nfnlnet->nfnl, group);
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
- return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- return netlink_set_err(net->nfnl, portid, group, error);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_set_err(nfnlnet->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
- int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
{
- return netlink_unicast(net->nfnl, skb, portid, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+ int err;
+
+ err = nlmsg_unicast(nfnlnet->nfnl, skb, portid);
+ if (err == -EAGAIN)
+ err = -ENOBUFS;
+
+ return err;
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
+ __u32 group, gfp_t allocation)
+{
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_broadcast);
+
/* Process one complete nfnetlink message. */
static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
@@ -171,6 +228,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
type = nlh->nlmsg_type;
replay:
rcu_read_lock();
+
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_MODULES
@@ -194,11 +252,19 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = extack,
+ };
/* Sanity-check NFNL_MAX_ATTR_COUNT */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -214,24 +280,32 @@ replay:
return err;
}
- if (nc->call_rcu) {
- err = nc->call_rcu(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
+ if (!nc->call) {
rcu_read_unlock();
- } else {
+ return -EINVAL;
+ }
+
+ switch (nc->type) {
+ case NFNL_CB_RCU:
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
+ rcu_read_unlock();
+ break;
+ case NFNL_CB_MUTEX:
rcu_read_unlock();
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
- nfnetlink_find_client(type, ss) != nc)
+ nfnetlink_find_client(type, ss) != nc) {
+ nfnl_unlock(subsys_id);
err = -EAGAIN;
- else if (nc->call)
- err = nc->call(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
- else
- err = -EINVAL;
+ break;
+ }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
nfnl_unlock(subsys_id);
+ break;
+ default:
+ rcu_read_unlock();
+ err = -EINVAL;
+ break;
}
if (err == -EAGAIN)
goto replay;
@@ -302,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
struct netlink_ext_ack extack;
+ struct nlmsghdr *onlh = nlh;
LIST_HEAD(err_list);
u32 status;
int err;
@@ -310,8 +385,9 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
-
+replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ nlh = onlh;
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
@@ -328,31 +404,36 @@ replay:
{
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
}
if (!ss->valid_genid || !ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!try_module_get(ss->owner)) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!ss->valid_genid(net, genid)) {
module_put(ss->owner);
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
nfnl_unlock(subsys_id);
+ if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
+ }
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -409,12 +490,25 @@ replay:
goto ack;
}
+ if (nc->type != NFNL_CB_BATCH) {
+ err = -EINVAL;
+ goto ack;
+ }
+
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
+ u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
int attrlen = nlh->nlmsg_len - min_len;
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = &extack,
+ };
/* Sanity-check NFTA_MAX_ATTR */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -425,15 +519,11 @@ replay:
err = nla_parse_deprecated(cda,
ss->cb[cb_id].attr_count,
attr, attrlen,
- ss->cb[cb_id].policy, NULL);
+ ss->cb[cb_id].policy, &extack);
if (err < 0)
goto ack;
- if (nc->call_batch) {
- err = nc->call_batch(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- &extack);
- }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
/* The lock was released to autoload some module, we
* have to abort and start from scratch using the
@@ -450,7 +540,8 @@ ack:
* processed, this avoids that the same error is
* reported several times when replaying the batch.
*/
- if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
+ if (err == -ENOMEM ||
+ nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
/* We failed to enqueue an error, reset the
* list of errors and send OOM to userspace
* pointing to the batch header.
@@ -476,9 +567,9 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(net, oskb, true);
+ ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
nfnl_err_reset(&err_list);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
@@ -487,17 +578,32 @@ done:
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
- ss->abort(net, oskb, false);
+ ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
}
} else {
- ss->abort(net, oskb, false);
+ enum nfnl_abort_action abort_action;
+
+ if (status & NFNL_BATCH_FAILURE)
+ abort_action = NFNL_ABORT_NONE;
+ else
+ abort_action = NFNL_ABORT_VALIDATE;
+
+ err = ss->abort(net, oskb, abort_action);
+ if (err == -EAGAIN) {
+ nfnl_err_reset(&err_list);
+ consume_skb(skb);
+ module_put(ss->owner);
+ status |= NFNL_BATCH_FAILURE;
+ goto replay_abort;
+ }
}
- if (ss->cleanup)
- ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
}
@@ -535,7 +641,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
/* Work around old nft using host byte order */
- if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES)
res_id = NFNL_SUBSYS_NFTABLES;
else
res_id = ntohs(nfgenmsg->res_id);
@@ -563,7 +669,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
-#ifdef CONFIG_MODULES
+static void nfnetlink_bind_event(struct net *net, unsigned int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+ u8 v;
+
+ /* All NFNLGRP_CONNTRACK_* group bits fit into u8.
+ * The other groups are not relevant and can be ignored.
+ */
+ if (group >= 8)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ v = READ_ONCE(nf_ctnetlink_has_listener);
+ if ((v & group_bit) == 0) {
+ v |= group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
+ }
+
+ spin_unlock(&nfnl_grp_active_lock);
+#endif
+}
+
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
@@ -579,43 +722,82 @@ static int nfnetlink_bind(struct net *net, int group)
rcu_read_unlock();
if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type);
+
+ nfnetlink_bind_event(net, group);
return 0;
}
+
+static void nfnetlink_unbind(struct net *net, int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+
+ if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ /* ctnetlink_has_listener is u8 */
+ if (group >= 8)
+ return;
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ if (!nfnetlink_has_listeners(net, group)) {
+ u8 v = READ_ONCE(nf_ctnetlink_has_listener);
+
+ v &= ~group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
+ }
+ spin_unlock(&nfnl_grp_active_lock);
#endif
+}
static int __net_init nfnetlink_net_init(struct net *net)
{
- struct sock *nfnl;
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
-#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
-#endif
+ .unbind = nfnetlink_unbind,
};
- nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
- if (!nfnl)
+ nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
+ if (!nfnlnet->nfnl)
return -ENOMEM;
- net->nfnl_stash = nfnl;
- rcu_assign_pointer(net->nfnl, nfnl);
return 0;
}
static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
{
+ struct nfnl_net *nfnlnet;
struct net *net;
- list_for_each_entry(net, net_exit_list, exit_list)
- RCU_INIT_POINTER(net->nfnl, NULL);
- synchronize_net();
- list_for_each_entry(net, net_exit_list, exit_list)
- netlink_kernel_release(net->nfnl_stash);
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nfnlnet = nfnl_pernet(net);
+
+ netlink_kernel_release(nfnlnet->nfnl);
+ }
}
static struct pernet_operations nfnetlink_net_ops = {
.init = nfnetlink_net_init,
.exit_batch = nfnetlink_net_exit_batch,
+ .id = &nfnetlink_pernet_id,
+ .size = sizeof(struct nfnl_net),
};
static int __init nfnetlink_init(void)
@@ -626,7 +808,7 @@ static int __init nfnetlink_init(void)
BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
for (i=0; i<NFNL_SUBSYS_COUNT; i++)
- mutex_init(&table[i].mutex);
+ __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]);
return register_pernet_subsys(&nfnetlink_net_ops);
}
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 5827117f2635..505f46a32173 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#include <linux/init.h>
#include <linux/module.h>
@@ -16,6 +16,7 @@
#include <linux/errno.h>
#include <net/netlink.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
@@ -41,17 +42,27 @@ struct nfacct_filter {
u32 mask;
};
+struct nfnl_acct_net {
+ struct list_head nfnl_acct_list;
+};
+
+static unsigned int nfnl_acct_net_id __read_mostly;
+
+static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_acct_net_id);
+}
+
#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
-static int nfnl_acct_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *nfacct, *matching = NULL;
- char *acct_name;
unsigned int size = 0;
+ char *acct_name;
u32 flags = 0;
if (!tb[NFACCT_NAME])
@@ -61,11 +72,11 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = nfacct;
@@ -73,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* reset counters if you request a replacement. */
atomic64_set(&matching->pkts, 0);
atomic64_set(&matching->bytes, 0);
@@ -112,7 +123,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
+ nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
@@ -123,7 +134,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
refcount_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list);
return 0;
}
@@ -132,21 +143,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_acct *acct)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
u64 pkts, bytes;
u32 old_flags;
event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFACCT_NAME, acct->name))
goto nla_put_failure;
@@ -188,6 +194,7 @@ static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -199,7 +206,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -264,16 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb)
return 0;
}
-static int nfnl_acct_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_acct_dump,
.start = nfnl_acct_start,
@@ -281,14 +287,14 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
.data = (void *)tb[NFACCT_FILTER],
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!tb[NFACCT_NAME])
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -301,21 +307,18 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
- NFNL_MSG_ACCT_NEW, cur);
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
@@ -337,24 +340,23 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
return ret;
}
-static int nfnl_acct_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *cur, *tmp;
int ret = -ENOENT;
char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -377,18 +379,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
};
static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
- [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_NEW] = {
+ .call = nfnl_acct_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_DEL] = {
+ .call = nfnl_acct_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_acct_subsys = {
@@ -402,10 +416,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
continue;
@@ -457,8 +472,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
kfree_skb(skb);
return;
}
- netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
- GFP_ATOMIC);
+ nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC);
}
int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct)
@@ -488,16 +502,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
static int __net_init nfnl_acct_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfnl_acct_list);
+ INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list);
return 0;
}
static void __net_exit nfnl_acct_net_exit(struct net *net)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *tmp;
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) {
list_del_rcu(&cur->head);
if (refcount_dec_and_test(&cur->refcnt))
@@ -508,6 +523,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
static struct pernet_operations nfnl_acct_ops = {
.init = nfnl_acct_net_init,
.exit = nfnl_acct_net_exit,
+ .id = &nfnl_acct_net_id,
+ .size = sizeof(struct nfnl_acct_net),
};
static int __init nfnl_acct_init(void)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 5b0d0a77379c..97248963a7d3 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -96,11 +96,13 @@ static int
nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
{
struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
if (attr == NULL)
return -EINVAL;
- if (help->helper->data_len == 0)
+ helper = rcu_dereference(help->helper);
+ if (!helper || helper->data_len == 0)
return -EINVAL;
nla_memcpy(help->data, attr, sizeof(help->data));
@@ -111,9 +113,11 @@ static int
nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
{
const struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
- if (help->helper->data_len &&
- nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
goto nla_put_failure;
return 0;
@@ -146,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
!tb[NFCTH_POLICY_EXPECT_TIMEOUT])
return -EINVAL;
- nla_strlcpy(expect_policy->name,
+ nla_strscpy(expect_policy->name,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
@@ -233,7 +237,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
if (ret < 0)
goto err1;
- nla_strlcpy(helper->name,
+ nla_strscpy(helper->name,
tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
if (size > sizeof_field(struct nf_conn_help, data)) {
@@ -380,10 +384,14 @@ static int
nfnl_cthelper_update(const struct nlattr * const tb[],
struct nf_conntrack_helper *helper)
{
+ u32 size;
int ret;
- if (tb[NFCTH_PRIV_DATA_LEN])
- return -EBUSY;
+ if (tb[NFCTH_PRIV_DATA_LEN]) {
+ size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+ if (size != helper->data_len)
+ return -EBUSY;
+ }
if (tb[NFCTH_POLICY]) {
ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
@@ -408,10 +416,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return 0;
}
-static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
const char *helper_name;
struct nf_conntrack_helper *cur, *helper = NULL;
@@ -441,7 +447,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
tuple.dst.protonum != cur->tuple.dst.protonum))
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
helper = cur;
@@ -526,20 +532,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_conntrack_helper *helper)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
int status;
event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFCTH_NAME, helper->name))
goto nla_put_failure;
@@ -612,10 +613,8 @@ out:
return skb->len;
}
-static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
int ret = -ENOENT;
struct nf_conntrack_helper *cur;
@@ -628,11 +627,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_cthelper_dump_table,
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (tb[NFCTH_NAME])
@@ -664,29 +663,23 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_CTHELPER_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
-
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
-static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
char *helper_name = NULL;
struct nf_conntrack_helper *cur;
@@ -748,15 +741,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
};
static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
- [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_NEW] = {
+ .call = nfnl_cthelper_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_GET] = {
+ .call = nfnl_cthelper_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_DEL] = {
+ .call = nfnl_cthelper_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index da915c224a82..38d75484e531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -20,6 +20,7 @@
#include <linux/netfilter.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -30,6 +31,24 @@
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
+static unsigned int nfct_timeout_id __read_mostly;
+
+struct ctnl_timeout {
+ struct list_head head;
+ struct list_head free_head;
+ struct rcu_head rcu_head;
+ refcount_t refcnt;
+ char name[CTNL_TIMEOUT_NAME_MAX];
+
+ /* must be at the end */
+ struct nf_ct_timeout timeout;
+};
+
+struct nfct_timeout_pernet {
+ struct list_head nfct_timeout_list;
+ struct list_head nfct_timeout_freelist;
+};
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
@@ -42,6 +61,11 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
[CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
};
+static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net)
+{
+ return net_generic(net, nfct_timeout_id);
+}
+
static int
ctnl_timeout_parse_policy(void *timeout,
const struct nf_conntrack_l4proto *l4proto,
@@ -71,12 +95,11 @@ err:
return ret;
}
-static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_new_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
__u16 l3num;
__u8 l4num;
const struct nf_conntrack_l4proto *l4proto;
@@ -94,11 +117,11 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- list_for_each_entry(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = timeout;
@@ -106,7 +129,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
@@ -116,7 +139,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
return ctnl_timeout_parse_policy(&matching->timeout.data,
matching->timeout.l4proto,
- net, cda[CTA_TIMEOUT_DATA]);
+ info->net,
+ cda[CTA_TIMEOUT_DATA]);
}
return -EBUSY;
@@ -137,8 +161,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
goto err_proto_put;
}
- ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
+ ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto,
+ info->net, cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -146,7 +170,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
timeout->timeout.l3num = l3num;
timeout->timeout.l4proto = l4proto;
refcount_set(&timeout->refcnt, 1);
- list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
+ __module_get(THIS_MODULE);
+ list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list);
return 0;
err:
@@ -160,22 +185,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct ctnl_timeout *timeout)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
htons(timeout->timeout.l3num)) ||
@@ -206,6 +226,7 @@ nla_put_failure:
static int
ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nfct_timeout_pernet *pernet;
struct net *net = sock_net(skb->sk);
struct ctnl_timeout *cur, *last;
@@ -217,7 +238,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) {
+ pernet = nfct_timeout_pernet(net);
+ list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) {
if (last) {
if (cur != last)
continue;
@@ -238,28 +260,27 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_get_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
int ret = -ENOENT;
char *name;
struct ctnl_timeout *cur;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnl_timeout_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!cda[CTA_TIMEOUT_NAME])
return -EINVAL;
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
@@ -272,21 +293,18 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
}
ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
@@ -309,30 +327,29 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
return ret;
}
-static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_del_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
struct ctnl_timeout *cur, *tmp;
int ret = -ENOENT;
char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list,
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list,
head)
- ctnl_timeout_try_del(net, cur);
+ ctnl_timeout_try_del(info->net, cur);
return 0;
}
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- ret = ctnl_timeout_try_del(net, cur);
+ ret = ctnl_timeout_try_del(info->net, cur);
if (ret < 0)
return ret;
@@ -341,18 +358,15 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
return ret;
}
-static int cttimeout_default_set(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_set(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
__u8 l4num;
int ret;
- if (!cda[CTA_TIMEOUT_L3PROTO] ||
- !cda[CTA_TIMEOUT_L4PROTO] ||
+ if (!cda[CTA_TIMEOUT_L4PROTO] ||
!cda[CTA_TIMEOUT_DATA])
return -EINVAL;
@@ -365,7 +379,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
goto err;
}
- ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
+ ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -382,21 +396,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
const unsigned int *timeouts)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
goto nla_put_failure;
@@ -420,18 +429,16 @@ nla_put_failure:
return -1;
}
-static int cttimeout_default_get(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
unsigned int *timeouts = NULL;
struct sk_buff *skb2;
- int ret, err;
__u16 l3num;
__u8 l4num;
+ int ret;
if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
return -EINVAL;
@@ -440,41 +447,35 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
l4proto = nf_ct_l4proto_find(l4num);
- err = -EOPNOTSUPP;
if (l4proto->l4proto != l4num)
- goto err;
+ return -EOPNOTSUPP;
switch (l4proto->l4proto) {
case IPPROTO_ICMP:
- timeouts = &nf_icmp_pernet(net)->timeout;
+ timeouts = &nf_icmp_pernet(info->net)->timeout;
break;
case IPPROTO_TCP:
- timeouts = nf_tcp_pernet(net)->timeouts;
+ timeouts = nf_tcp_pernet(info->net)->timeouts;
break;
- case IPPROTO_UDP: /* fallthrough */
+ case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- timeouts = nf_udp_pernet(net)->timeouts;
- break;
- case IPPROTO_DCCP:
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- timeouts = nf_dccp_pernet(net)->dccp_timeout;
-#endif
+ timeouts = nf_udp_pernet(info->net)->timeouts;
break;
case IPPROTO_ICMPV6:
- timeouts = &nf_icmpv6_pernet(net)->timeout;
+ timeouts = &nf_icmpv6_pernet(info->net)->timeout;
break;
case IPPROTO_SCTP:
#ifdef CONFIG_NF_CT_PROTO_SCTP
- timeouts = nf_sctp_pernet(net)->timeouts;
+ timeouts = nf_sctp_pernet(info->net)->timeouts;
#endif
break;
case IPPROTO_GRE:
#ifdef CONFIG_NF_CT_PROTO_GRE
- timeouts = nf_gre_pernet(net)->timeouts;
+ timeouts = nf_gre_pernet(info->net)->timeouts;
#endif
break;
case 255:
- timeouts = &nf_generic_pernet(net)->timeout;
+ timeouts = &nf_generic_pernet(info->net)->timeout;
break;
default:
WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto);
@@ -482,50 +483,38 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
}
if (!timeouts)
- goto err;
+ return -EOPNOTSUPP;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
- err = -ENOMEM;
- goto err;
- }
+ if (!skb2)
+ return -ENOMEM;
- ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ ret = cttimeout_default_fill_info(info->net, skb2,
+ NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
l3num, l4proto, timeouts);
if (ret <= 0) {
kfree_skb(skb2);
- err = -ENOMEM;
- goto err;
+ return -ENOMEM;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
-err:
- return err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
const char *name)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *timeout, *matching = NULL;
- list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (!try_module_get(THIS_MODULE))
+ if (!refcount_inc_not_zero(&timeout->refcnt))
goto err;
-
- if (!refcount_inc_not_zero(&timeout->refcnt)) {
- module_put(THIS_MODULE);
- goto err;
- }
matching = timeout;
break;
}
@@ -538,28 +527,43 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t)
struct ctnl_timeout *timeout =
container_of(t, struct ctnl_timeout, timeout);
- if (refcount_dec_and_test(&timeout->refcnt))
+ if (refcount_dec_and_test(&timeout->refcnt)) {
kfree_rcu(timeout, rcu_head);
-
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+ }
}
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
- [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_NEW] = {
+ .call = cttimeout_new_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_GET] = {
+ .call = cttimeout_get_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = {
+ .call = cttimeout_del_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = {
+ .call = cttimeout_default_set,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = {
+ .call = cttimeout_default_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
};
static const struct nfnetlink_subsystem cttimeout_subsys = {
@@ -573,20 +577,39 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
static int __net_init cttimeout_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfct_timeout_list);
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+
+ INIT_LIST_HEAD(&pernet->nfct_timeout_list);
+ INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
+static void __net_exit cttimeout_net_pre_exit(struct net *net)
+{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+ struct ctnl_timeout *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
+ }
+
+ /* core calls synchronize_rcu() after this */
+}
+
static void __net_exit cttimeout_net_exit(struct net *net)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
- nf_ct_unconfirmed_destroy(net);
+ if (list_empty(&pernet->nfct_timeout_freelist))
+ return;
+
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
- list_del_rcu(&cur->head);
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
+ list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -595,7 +618,15 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
+ .pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
+ .id = &nfct_timeout_id,
+ .size = sizeof(struct nfct_timeout_pernet),
+};
+
+static const struct nf_ct_timeout_hooks hooks = {
+ .timeout_find_get = ctnl_timeout_find_get,
+ .timeout_put = ctnl_timeout_put,
};
static int __init cttimeout_init(void)
@@ -612,8 +643,7 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
return 0;
err_out:
@@ -621,14 +651,24 @@ err_out:
return ret;
}
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ return 0;
+}
+
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
+
+ nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
new file mode 100644
index 000000000000..92d869317cba
--- /dev/null
+++ b/net/netfilter/nfnetlink_hook.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021 Red Hat GmbH
+ *
+ * Author: Florian Westphal <fw@strlen.de>
+ */
+
+#include <linux/bpf.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_hook.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <net/sock.h>
+
+static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = {
+ [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 },
+ [NFNLA_HOOK_DEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING,
+ .len = KSYM_NAME_LEN, },
+ [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING,
+ .len = MODULE_NAME_LEN, },
+ [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, },
+};
+
+static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *c)
+{
+ int err;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
+ err = netlink_dump_start(nlsk, skb, nlh, c);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ return err;
+}
+
+struct nfnl_dump_hook_data {
+ char devname[IFNAMSIZ];
+ unsigned long headv;
+ u8 hook;
+};
+
+static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t)
+{
+ struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+ int ret;
+
+ if (!nest)
+ return NULL;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t));
+ if (ret == 0)
+ return nest;
+
+ nla_nest_cancel(nlskb, nest);
+ return NULL;
+}
+
+static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ const struct bpf_prog *prog)
+{
+ struct nlattr *nest, *nest2;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK))
+ return 0;
+
+ if (WARN_ON_ONCE(!prog))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF);
+ if (!nest)
+ return -EMSGSIZE;
+
+ nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest2)
+ goto cancel_nest;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id));
+ if (ret)
+ goto cancel_nest;
+
+ nla_nest_end(nlskb, nest2);
+ nla_nest_end(nlskb, nest);
+ return 0;
+
+cancel_nest:
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+}
+
+static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname,
+ const char *name, u8 family)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest ||
+ nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) ||
+ nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) ||
+ nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) {
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nft_chain *chain)
+{
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!chain))
+ return 0;
+
+ if (!nft_is_active(net, chain))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name,
+ chain->name, chain->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nf_flowtable *nf_ft)
+{
+ struct nft_flowtable *ft =
+ container_of(nf_ft, struct nft_flowtable, data);
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!nf_ft))
+ return 0;
+
+ if (!nft_is_active(net, ft))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name,
+ ft->name, ft->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_dump_one(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ const struct nf_hook_ops *ops,
+ int family, unsigned int seq)
+{
+ u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET);
+ unsigned int portid = NETLINK_CB(nlskb).portid;
+ struct nlmsghdr *nlh;
+ int ret = -EMSGSIZE;
+ u32 hooknum;
+#ifdef CONFIG_KALLSYMS
+ char sym[KSYM_SYMBOL_LEN];
+ char *module_name;
+#endif
+ nlh = nfnl_msg_put(nlskb, portid, seq, event,
+ NLM_F_MULTI, family, NFNETLINK_V0, 0);
+ if (!nlh)
+ goto nla_put_failure;
+
+#ifdef CONFIG_KALLSYMS
+ ret = snprintf(sym, sizeof(sym), "%ps", ops->hook);
+ if (ret >= sizeof(sym)) {
+ ret = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ module_name = strstr(sym, " [");
+ if (module_name) {
+ char *end;
+
+ *module_name = '\0';
+ module_name += 2;
+ end = strchr(module_name, ']');
+ if (end) {
+ *end = 0;
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name);
+ if (ret)
+ goto nla_put_failure;
+ }
+ }
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym);
+ if (ret)
+ goto nla_put_failure;
+#endif
+
+ if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS)
+ hooknum = NF_NETDEV_INGRESS;
+ else
+ hooknum = ops->hooknum;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum));
+ if (ret)
+ goto nla_put_failure;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority));
+ if (ret)
+ goto nla_put_failure;
+
+ switch (ops->hook_ops_type) {
+ case NF_HOOK_OP_NF_TABLES:
+ ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_BPF:
+ ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_NFT_FT:
+ ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_UNDEFINED:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (ret)
+ goto nla_put_failure;
+
+ nlmsg_end(nlskb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_trim(nlskb, nlh);
+ return ret;
+}
+
+static const struct nf_hook_entries *
+nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev)
+{
+ const struct nf_hook_entries *hook_head = NULL;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ struct net_device *netdev;
+#endif
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
+ break;
+ case NFPROTO_IPV6:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
+ break;
+ case NFPROTO_ARP:
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
+ if (hook >= ARRAY_SIZE(net->nf.hooks_arp))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
+#endif
+ break;
+ case NFPROTO_BRIDGE:
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ if (hook >= ARRAY_SIZE(net->nf.hooks_bridge))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+#endif
+ break;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ case NFPROTO_NETDEV:
+ if (hook >= NF_NETDEV_NUMHOOKS)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ netdev = dev_get_by_name_rcu(net, dev);
+ if (!netdev)
+ return ERR_PTR(-ENODEV);
+
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (hook == NF_NETDEV_INGRESS)
+ return rcu_dereference(netdev->nf_hooks_ingress);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hook == NF_NETDEV_EGRESS)
+ return rcu_dereference(netdev->nf_hooks_egress);
+#endif
+ fallthrough;
+#endif
+ default:
+ return ERR_PTR(-EPROTONOSUPPORT);
+ }
+
+ return hook_head;
+}
+
+static int nfnl_hook_dump(struct sk_buff *nlskb,
+ struct netlink_callback *cb)
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nfnl_dump_hook_data *ctx = cb->data;
+ int err, family = nfmsg->nfgen_family;
+ struct net *net = sock_net(nlskb->sk);
+ struct nf_hook_ops * const *ops;
+ const struct nf_hook_entries *e;
+ unsigned int i = cb->args[0];
+
+ rcu_read_lock();
+
+ e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname);
+ if (!e)
+ goto done;
+
+ if (IS_ERR(e)) {
+ cb->seq++;
+ goto done;
+ }
+
+ if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries)
+ cb->seq++;
+
+ ops = nf_hook_entries_get_hook_ops(e);
+
+ for (; i < e->num_hook_entries; i++) {
+ err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family,
+ cb->nlh->nlmsg_seq);
+ if (err)
+ break;
+ }
+
+done:
+ nl_dump_check_consistent(cb, nlmsg_hdr(nlskb));
+ rcu_read_unlock();
+ cb->args[0] = i;
+ return nlskb->len;
+}
+
+static int nfnl_hook_dump_start(struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nlattr * const *nla = cb->data;
+ struct nfnl_dump_hook_data *ctx = NULL;
+ struct net *net = sock_net(cb->skb->sk);
+ u8 family = nfmsg->nfgen_family;
+ char name[IFNAMSIZ] = "";
+ const void *head;
+ u32 hooknum;
+
+ hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM]));
+ if (hooknum > 255)
+ return -EINVAL;
+
+ if (family == NFPROTO_NETDEV) {
+ if (!nla[NFNLA_HOOK_DEV])
+ return -EINVAL;
+
+ nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name));
+ }
+
+ rcu_read_lock();
+ /* Not dereferenced; for consistency check only */
+ head = nfnl_hook_entries_head(family, hooknum, net, name);
+ rcu_read_unlock();
+
+ if (head && IS_ERR(head))
+ return PTR_ERR(head);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ strscpy(ctx->devname, name, sizeof(ctx->devname));
+ ctx->headv = (unsigned long)head;
+ ctx->hook = hooknum;
+
+ cb->seq = 1;
+ cb->data = ctx;
+
+ return 0;
+}
+
+static int nfnl_hook_dump_stop(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
+static int nfnl_hook_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ if (!nla[NFNLA_HOOK_HOOKNUM])
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nfnl_hook_dump_start,
+ .done = nfnl_hook_dump_stop,
+ .dump = nfnl_hook_dump,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = {
+ [NFNL_MSG_HOOK_GET] = {
+ .call = nfnl_hook_get,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFNLA_HOOK_MAX,
+ .policy = nfnl_hook_nla_policy
+ },
+};
+
+static const struct nfnetlink_subsystem nfhook_subsys = {
+ .name = "nfhook",
+ .subsys_id = NFNL_SUBSYS_HOOK,
+ .cb_count = NFNL_MSG_HOOK_MAX,
+ .cb = nfnl_hook_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK);
+
+static int __init nfnetlink_hook_init(void)
+{
+ return nfnetlink_subsys_register(&nfhook_subsys);
+}
+
+static void __exit nfnetlink_hook_exit(void)
+{
+ nfnetlink_subsys_unregister(&nfhook_subsys);
+}
+
+module_init(nfnetlink_hook_init);
+module_exit(nfnetlink_hook_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks");
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 0ba020ca38e6..bfcb9cd335bf 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -43,6 +43,10 @@
#include "../bridge/br_private.h"
#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
#define NFULNL_COPY_DISABLED 0xff
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
@@ -62,6 +66,7 @@ struct nfulnl_instance {
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
struct net *net;
+ netns_tracker ns_tracker;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
u32 peer_portid; /* PORTID of the peer process */
@@ -98,9 +103,9 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)
}
static struct nfulnl_instance *
-__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
+__instance_lookup(const struct nfnl_log_net *log, u16 group_num)
{
- struct hlist_head *head;
+ const struct hlist_head *head;
struct nfulnl_instance *inst;
head = &log->instance_table[instance_hashfn(group_num)];
@@ -118,15 +123,25 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
+instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num)
{
struct nfulnl_instance *inst;
- rcu_read_lock_bh();
inst = __instance_lookup(log, group_num);
if (inst && !refcount_inc_not_zero(&inst->use))
inst = NULL;
- rcu_read_unlock_bh();
+
+ return inst;
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(const struct nfnl_log_net *log, u16 group_num)
+{
+ struct nfulnl_instance *inst;
+
+ rcu_read_lock();
+ inst = instance_lookup_get_rcu(log, group_num);
+ rcu_read_unlock();
return inst;
}
@@ -136,7 +151,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head)
struct nfulnl_instance *inst =
container_of(head, struct nfulnl_instance, rcu);
- put_net(inst->net);
+ put_net_track(inst->net, &inst->ns_tracker);
kfree(inst);
module_put(THIS_MODULE);
}
@@ -183,7 +198,7 @@ instance_create(struct net *net, u_int16_t group_num,
timer_setup(&inst->timer, nfulnl_timer, 0);
- inst->net = get_net(net);
+ inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -356,8 +371,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
goto out;
}
}
- nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid);
out:
inst->qlen = 0;
inst->skb = NULL;
@@ -367,7 +381,7 @@ static void
__nfulnl_flush(struct nfulnl_instance *inst)
{
/* timer holds a reference */
- if (del_timer(&inst->timer))
+ if (timer_delete(&inst->timer))
instance_put(inst);
if (inst->skb)
__nfulnl_send(inst);
@@ -376,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst)
static void
nfulnl_timer(struct timer_list *t)
{
- struct nfulnl_instance *inst = from_timer(inst, t, timer);
+ struct nfulnl_instance *inst = timer_container_of(inst, t, timer);
spin_lock_bh(&inst->lock);
if (inst->skb)
@@ -453,20 +467,15 @@ __build_packet_message(struct nfnl_log_net *log,
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
sk_buff_data_t old_tail = inst->skb->tail;
struct sock *sk;
const unsigned char *hwhdrp;
- nlh = nlmsg_put(inst->skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(inst->skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
+ 0, pf, NFNETLINK_V0, htons(inst->group_num));
if (!nlh)
return -1;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(inst->group_num);
memset(&pmsg, 0, sizeof(pmsg));
pmsg.hw_protocol = skb->protocol;
@@ -499,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
} else {
- struct net_device *physindev;
+ int physinif;
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -507,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(indev->ifindex)))
goto nla_put_failure;
- physindev = nf_bridge_get_physindev(skb);
- if (physindev &&
+ physinif = nf_bridge_get_physinif(skb);
+ if (physinif &&
nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(physindev->ifindex)))
+ htonl(physinif)))
goto nla_put_failure;
}
#endif
@@ -558,7 +567,8 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
if (indev && skb->dev &&
- skb->mac_header != skb->network_header) {
+ skb_mac_header_was_set(skb) &&
+ skb_mac_header_len(skb) != 0) {
struct nfulnl_msg_packet_hw phw;
int len;
@@ -588,9 +598,9 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (hooknum <= NF_INET_FORWARD && skb->tstamp) {
+ if (hooknum <= NF_INET_FORWARD) {
+ struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true));
struct nfulnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -688,15 +698,15 @@ nfulnl_log_packet(struct net *net,
unsigned int plen = 0;
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
+ enum ip_conntrack_info ctinfo = 0;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(log, li->u.ulog.group);
+ inst = instance_lookup_get_rcu(log, li->u.ulog.group);
if (!inst)
return;
@@ -734,14 +744,16 @@ nfulnl_log_packet(struct net *net,
size += nla_total_size(sizeof(u_int32_t));
if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
size += nla_total_size(sizeof(u_int32_t));
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(skb, &ctinfo);
+ ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
size += nfulnl_get_bridge_size(skb);
@@ -845,10 +857,8 @@ static struct notifier_block nfulnl_rtnl_notifier = {
.notifier_call = nfulnl_rcv_nl_event,
};
-static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
return -ENOTSUPP;
}
@@ -869,29 +879,26 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
[NFULA_CFG_FLAGS] = { .type = NLA_U16 },
};
-static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfula[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t group_num = ntohs(nfmsg->res_id);
- struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(info->net);
+ u_int16_t group_num = ntohs(info->nfmsg->res_id);
struct nfulnl_msg_config_cmd *cmd = NULL;
- struct nfnl_log_net *log = nfnl_log_pernet(net);
- int ret = 0;
+ struct nfulnl_instance *inst;
u16 flags = 0;
+ int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
- u_int8_t pf = nfmsg->nfgen_family;
+ u_int8_t pf = info->nfmsg->nfgen_family;
cmd = nla_data(nfula[NFULA_CFG_CMD]);
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(net, pf, &nfulnl_logger);
+ return nf_log_bind_pf(info->net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(net, pf);
+ nf_log_unbind_pf(info->net, pf);
return 0;
}
}
@@ -932,7 +939,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
goto out_put;
}
- inst = instance_create(net, group_num,
+ inst = instance_create(info->net, group_num,
NETLINK_CB(skb).portid,
sk_user_ns(NETLINK_CB(skb).sk));
if (IS_ERR(inst)) {
@@ -993,11 +1000,17 @@ out:
}
static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
- [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
- .attr_count = NFULA_MAX, },
- [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX,
- .policy = nfula_cfg_policy },
+ [NFULNL_MSG_PACKET] = {
+ .call = nfulnl_recv_unsupp,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_MAX,
+ },
+ [NFULNL_MSG_CONFIG] = {
+ .call = nfulnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_CFG_MAX,
+ .policy = nfula_cfg_policy
+ },
};
static const struct nfnetlink_subsystem nfulnl_subsys = {
@@ -1025,7 +1038,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
struct hlist_head *head = &log->instance_table[st->bucket];
if (!hlist_empty(head))
- return rcu_dereference_bh(hlist_first_rcu(head));
+ return rcu_dereference(hlist_first_rcu(head));
}
return NULL;
}
@@ -1033,7 +1046,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
static struct hlist_node *get_next(struct net *net, struct iter_state *st,
struct hlist_node *h)
{
- h = rcu_dereference_bh(hlist_next_rcu(h));
+ h = rcu_dereference(hlist_next_rcu(h));
while (!h) {
struct nfnl_log_net *log;
struct hlist_head *head;
@@ -1043,7 +1056,7 @@ static struct hlist_node *get_next(struct net *net, struct iter_state *st,
log = nfnl_log_pernet(net);
head = &log->instance_table[st->bucket];
- h = rcu_dereference_bh(hlist_first_rcu(head));
+ h = rcu_dereference(hlist_first_rcu(head));
}
return h;
}
@@ -1061,9 +1074,9 @@ static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
}
static void *seq_start(struct seq_file *s, loff_t *pos)
- __acquires(rcu_bh)
+ __acquires(rcu)
{
- rcu_read_lock_bh();
+ rcu_read_lock();
return get_idx(seq_file_net(s), s->private, *pos);
}
@@ -1074,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(rcu_bh)
+ __releases(rcu)
{
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
static int seq_show(struct seq_file *s, void *v)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 916a3c7f9eaf..c0fc431991e8 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
sizeof(struct tcphdr), ctx->optsize, opts);
+ if (!ctx->optp)
+ return NULL;
}
return tcp;
@@ -267,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb,
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
struct tcphdr _tcph;
+ bool found = false;
memset(&ctx, 0, sizeof(ctx));
@@ -281,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb,
data->genre = f->genre;
data->version = f->version;
+ found = true;
break;
}
- return true;
+ return found;
}
EXPORT_SYMBOL_GPL(nf_osf_find);
@@ -292,10 +296,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
[OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
};
-static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_add_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *kf = NULL, *sf;
@@ -307,11 +310,19 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
if (!osf_attrs[OSF_ATTR_FINGER])
return -EINVAL;
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -EINVAL;
f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+ if (f->opt_num > ARRAY_SIZE(f->opt))
+ return -EINVAL;
+
+ if (!memchr(f->genre, 0, MAXGENRELEN) ||
+ !memchr(f->subtype, 0, MAXGENRELEN) ||
+ !memchr(f->version, 0, MAXGENRELEN))
+ return -EINVAL;
+
kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
if (!kf)
return -ENOMEM;
@@ -325,7 +336,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
kfree(kf);
kf = NULL;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
err = -EEXIST;
break;
}
@@ -339,11 +350,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
return err;
}
-static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_remove_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *sf;
@@ -377,11 +386,13 @@ static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
[OSF_MSG_ADD] = {
.call = nfnl_osf_add_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
[OSF_MSG_REMOVE] = {
.call = nfnl_osf_remove_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
@@ -436,3 +447,5 @@ module_init(nfnl_osf_init);
module_exit(nfnl_osf_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Passive OS fingerprint matching");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 3243a31f6e82..8b7b39d8a109 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -29,6 +29,8 @@
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
+#include <linux/cgroup-defs.h>
+#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
@@ -167,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head)
struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
rcu);
+ rcu_read_lock();
nfqnl_flush(inst, NULL, 0);
+ rcu_read_unlock();
kfree(inst);
module_put(THIS_MODULE);
}
@@ -223,22 +227,174 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
return entry;
}
-static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+static unsigned int nf_iterate(struct sk_buff *skb,
+ struct nf_hook_state *state,
+ const struct nf_hook_entries *hooks,
+ unsigned int *index)
+{
+ const struct nf_hook_entry *hook;
+ unsigned int verdict, i = *index;
+
+ while (i < hooks->num_hook_entries) {
+ hook = &hooks->hooks[i];
+repeat:
+ verdict = nf_hook_entry_hookfn(hook, skb, state);
+ if (verdict != NF_ACCEPT) {
+ *index = i;
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ i++;
+ }
+
+ *index = i;
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
+{
+ switch (pf) {
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ case NFPROTO_BRIDGE:
+ return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+#endif
+ case NFPROTO_IPV4:
+ return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
+ case NFPROTO_IPV6:
+ return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ return NULL;
+}
+
+static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+{
+#ifdef CONFIG_INET
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(entry->state.net, entry->state.sk,
+ skb, RTN_UNSPEC);
+ }
+#endif
+ return 0;
+}
+
+static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ const struct nf_ipv6_ops *v6ops;
+ int ret = 0;
+
+ switch (entry->state.pf) {
+ case AF_INET:
+ ret = nf_ip_reroute(skb, entry);
+ break;
+ case AF_INET6:
+ v6ops = rcu_dereference(nf_ipv6_ops);
+ if (v6ops)
+ ret = v6ops->reroute(skb, entry);
+ break;
+ }
+ return ret;
+}
+
+/* caller must hold rcu read-side lock */
+static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_hook_entry *hook_entry;
+ const struct nf_hook_entries *hooks;
+ struct sk_buff *skb = entry->skb;
+ const struct net *net;
+ unsigned int i;
int err;
+ u8 pf;
+
+ net = entry->state.net;
+ pf = entry->state.pf;
+
+ hooks = nf_hook_entries_head(net, pf, entry->state.hook);
+
+ i = entry->hook_index;
+ if (!hooks || i >= hooks->num_hook_entries) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
+ nf_queue_entry_free(entry);
+ return;
+ }
+
+ hook_entry = &hooks->hooks[i];
+
+ /* Continue traversal iff userspace said ok... */
+ if (verdict == NF_REPEAT)
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
+
+ if (verdict == NF_ACCEPT) {
+ if (nf_reroute(skb, entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ if (verdict == NF_ACCEPT) {
+next_hook:
+ ++i;
+ verdict = nf_iterate(skb, &entry->state, hooks, &i);
+ }
+
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_STOP:
+ local_bh_disable();
+ entry->state.okfn(entry->state.net, entry->state.sk, skb);
+ local_bh_enable();
+ break;
+ case NF_QUEUE:
+ err = nf_queue(skb, &entry->state, i, verdict);
+ if (err == 1)
+ goto next_hook;
+ break;
+ case NF_STOLEN:
+ break;
+ default:
+ kfree_skb(skb);
+ }
+
+ nf_queue_entry_free(entry);
+}
+
+static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ const struct nf_ct_hook *ct_hook;
if (verdict == NF_ACCEPT ||
verdict == NF_REPEAT ||
verdict == NF_STOP) {
+ unsigned int ct_verdict = verdict;
+
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- if (ct_hook) {
- err = ct_hook->update(entry->state.net, entry->skb);
- if (err < 0)
- verdict = NF_DROP;
- }
+ if (ct_hook)
+ ct_verdict = ct_hook->update(entry->state.net, entry->skb);
rcu_read_unlock();
+
+ switch (ct_verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ /* follow userspace verdict, could be REPEAT */
+ break;
+ case NF_STOLEN:
+ nf_queue_entry_free(entry);
+ return;
+ default:
+ verdict = ct_verdict & NF_VERDICT_MASK;
+ break;
+ }
}
nf_reinject(entry, verdict);
}
@@ -301,18 +457,31 @@ nla_put_failure:
return -1;
}
-static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ if (sk && sk_fullsock(sk)) {
+ u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+
+ if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
{
- u32 seclen = 0;
+ int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+
if (!skb || !sk_fullsock(skb->sk))
return 0;
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->secmark)
- security_secid_to_secctx(skb->secmark, secdata, &seclen);
-
+ seclen = security_secid_to_secctx(skb->secmark, ctx);
read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
return seclen;
@@ -371,6 +540,14 @@ nla_put_failure:
return -1;
}
+static int nf_queue_checksum_help(struct sk_buff *entskb)
+{
+ if (skb_csum_is_sctp(entskb))
+ return skb_crc32c_csum_help(entskb);
+
+ return skb_checksum_help(entskb);
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
@@ -383,16 +560,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *entskb = entry->skb;
struct net_device *indev;
struct net_device *outdev;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
+ enum ip_conntrack_info ctinfo = 0;
+ const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
- char *secdata = NULL;
- u32 seclen = 0;
+ struct lsm_context ctx = { NULL, 0, 0 };
+ int seclen = 0;
+ ktime_t tstamp;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
@@ -403,11 +580,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ + nla_total_size(sizeof(u_int32_t)) /* classid */
+#endif
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp)
+ tstamp = skb_tstamp_cond(entskb, false);
+ if (tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -428,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
case NFQNL_COPY_PACKET:
if (!(queue->flags & NFQA_CFG_F_GSO) &&
entskb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(entskb))
+ nf_queue_checksum_help(entskb))
return NULL;
data_len = READ_ONCE(queue->copy_range);
@@ -444,13 +626,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfnl_ct = rcu_dereference(nfnl_ct_hook);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (queue->flags & NFQA_CFG_F_CONNTRACK) {
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(entskb, &ctinfo);
+ ct = nf_ct_get(entskb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
if (queue->flags & NFQA_CFG_F_UID_GID) {
size += (nla_total_size(sizeof(u_int32_t)) /* uid */
@@ -458,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
- seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ seclen = nfqnl_get_sk_secctx(entskb, &ctx);
+ if (seclen < 0)
+ return NULL;
if (seclen)
size += nla_total_size(seclen);
}
@@ -469,18 +655,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
goto nlmsg_failure;
}
- nlh = nlmsg_put(skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
+ 0, entry->state.pf, NFNETLINK_V0,
+ htons(queue->queue_num));
if (!nlh) {
skb_tx_error(entskb);
kfree_skb(skb);
goto nlmsg_failure;
}
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = entry->state.pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(queue->queue_num);
nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
pmsg = nla_data(nla);
@@ -561,8 +744,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
goto nla_put_failure;
+ if (entskb->priority &&
+ nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+ goto nla_put_failure;
+
if (indev && entskb->dev &&
- entskb->mac_header != entskb->network_header) {
+ skb_mac_header_was_set(entskb) &&
+ skb_mac_header_len(entskb) != 0) {
struct nfqnl_msg_packet_hw phw;
int len;
@@ -578,9 +766,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) {
+ if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -593,7 +781,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
goto nla_put_failure;
if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
@@ -621,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
nlh->nlmsg_len = skb->len;
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return skb;
nla_put_failure:
@@ -630,8 +821,8 @@ nla_put_failure:
kfree_skb(skb);
net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return NULL;
}
@@ -639,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- const struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ unsigned long status;
+ unsigned int use;
+
+ if (!ct)
+ return false;
- if (ct && ((ct->status & flags) == IPS_DYING))
+ status = READ_ONCE(ct->status);
+ if ((status & flags) == IPS_DYING)
return true;
+
+ if (status & IPS_CONFIRMED)
+ return false;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+ * We can't be solved with serialization here because one clone could
+ * have been queued for local delivery.
+ */
+ use = refcount_read(&ct->ct_general.use);
+ if (likely(use == 1))
+ return false;
+
+ /* Can't decrement further? Exclusive ownership. */
+ if (!refcount_dec_not_one(&ct->ct_general.use))
+ return false;
+
+ skb_set_nfct(entry->skb, 0);
+ /* No nf_ct_put(): we already decremented .use and it cannot
+ * drop down to 0.
+ */
+ return true;
#endif
return false;
}
@@ -681,7 +903,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
*packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid);
if (err < 0) {
if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
failopen = 1;
@@ -711,9 +933,15 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry)
- nf_queue_entry_get_refs(entry);
- return entry;
+
+ if (!entry)
+ return NULL;
+
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+
+ kfree(entry);
+ return NULL;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -796,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
break;
}
- if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+ if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
return __nfqnl_enqueue_packet(net, queue, entry);
nf_bridge_adjust_skb_data(skb);
@@ -831,11 +1059,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
@@ -953,6 +1186,16 @@ static void nfqnl_nf_hook_drop(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
+ /* This function is also called on net namespace error unwind,
+ * when pernet_ops->init() failed and ->exit() functions of the
+ * previous pernet_ops gets called.
+ *
+ * This may result in a call to nfqnl_nf_hook_drop() before
+ * struct nfnl_queue_net was allocated.
+ */
+ if (!q)
+ return;
+
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;
struct hlist_head *head = &q->instance_table[i];
@@ -1005,11 +1248,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
[NFQA_VLAN] = { .type = NLA_NESTED },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static struct nfqnl_instance *
@@ -1048,20 +1293,17 @@ static int nfq_id_after(unsigned int id, unsigned int max)
return (int)(id - max) > 0;
}
-static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u16 queue_num = ntohs(info->nfmsg->res_id);
struct nf_queue_entry *entry, *tmp;
- unsigned int verdict, maxid;
struct nfqnl_msg_verdict_hdr *vhdr;
struct nfqnl_instance *queue;
+ unsigned int verdict, maxid;
LIST_HEAD(batch_list);
- u16 queue_num = ntohs(nfmsg->res_id);
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
queue = verdict_instance_lookup(q, queue_num,
NETLINK_CB(skb).portid);
@@ -1093,20 +1335,24 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
}
return 0;
}
-static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
const struct nlmsghdr *nlh,
const struct nlattr * const nfqa[],
struct nf_queue_entry *entry,
enum ip_conntrack_info *ctinfo)
{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct nf_conn *ct;
- ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+ ct = nf_ct_get(entry->skb, ctinfo);
if (ct == NULL)
return NULL;
@@ -1118,6 +1364,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
NETLINK_CB(entry->skb).portid,
nlmsg_report(nlh));
return ct;
+#else
+ return NULL;
+#endif
}
static int nfqa_parse_bridge(struct nf_queue_entry *entry,
@@ -1156,22 +1405,18 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
return 0;
}
-static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+ const struct nfnl_ct_hook *nfnl_ct;
struct nfqnl_msg_verdict_hdr *vhdr;
+ enum ip_conntrack_info ctinfo;
struct nfqnl_instance *queue;
- unsigned int verdict;
struct nf_queue_entry *entry;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ unsigned int verdict;
int err;
queue = verdict_instance_lookup(q, queue_num,
@@ -1194,7 +1439,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_CT]) {
if (nfnl_ct != NULL)
- ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
+ ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
+ &ctinfo);
}
if (entry->state.pf == PF_BRIDGE) {
@@ -1218,14 +1464,15 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
return 0;
}
-static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
return -ENOTSUPP;
}
@@ -1243,16 +1490,13 @@ static const struct nf_queue_handler nfqh = {
.nf_hook_drop = nfqnl_nf_hook_drop,
};
-static int nfqnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
- struct nfqnl_instance *queue;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
struct nfqnl_msg_config_cmd *cmd = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ struct nfqnl_instance *queue;
__u32 flags = 0, mask = 0;
int ret = 0;
@@ -1371,17 +1615,29 @@ err_out_unlock:
}
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
- [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_policy },
- [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .policy = nfqa_cfg_policy },
- [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_batch_policy },
+ [NFQNL_MSG_PACKET] = {
+ .call = nfqnl_recv_unsupp,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ },
+ [NFQNL_MSG_VERDICT] = {
+ .call = nfqnl_recv_verdict,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy
+ },
+ [NFQNL_MSG_CONFIG] = {
+ .call = nfqnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy
+ },
+ [NFQNL_MSG_VERDICT_BATCH] = {
+ .call = nfqnl_recv_verdict_batch,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy
+ },
};
static const struct nfnetlink_subsystem nfqnl_subsys = {
@@ -1499,7 +1755,6 @@ static int __net_init nfnl_queue_net_init(struct net *net)
&nfqnl_seq_ops, sizeof(struct iter_state)))
return -ENOMEM;
#endif
- nf_register_queue_handler(net, &nfqh);
return 0;
}
@@ -1508,7 +1763,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
unsigned int i;
- nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
@@ -1516,15 +1770,9 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
}
-static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
-{
- synchronize_rcu();
-}
-
static struct pernet_operations nfnl_queue_net_ops = {
.init = nfnl_queue_net_init,
.exit = nfnl_queue_net_exit,
- .exit_batch = nfnl_queue_net_exit_batch,
.id = &nfnl_queue_net_id,
.size = sizeof(struct nfnl_queue_net),
};
@@ -1552,6 +1800,8 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_subsys;
}
+ nf_register_queue_handler(&nfqh);
+
return status;
cleanup_netlink_subsys:
@@ -1565,6 +1815,7 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
+ nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index bc37d6c59db4..d550910aabec 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -16,8 +16,9 @@
#include <net/netfilter/nf_tables_offload.h>
struct nft_bitwise {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 sreg2;
+ u8 dreg;
enum nft_bitwise_ops op:8;
u8 len;
struct nft_data mask;
@@ -25,12 +26,12 @@ struct nft_bitwise {
struct nft_data data;
};
-static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
- const struct nft_bitwise *priv)
+static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
{
unsigned int i;
- for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++)
dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
}
@@ -60,101 +61,184 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
}
}
+static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] & src2[i];
+}
+
+static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] | src2[i];
+}
+
+static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] ^ src2[i];
+}
+
void nft_bitwise_eval(const struct nft_expr *expr,
struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
- const u32 *src = &regs->data[priv->sreg];
+ const u32 *src = &regs->data[priv->sreg], *src2;
u32 *dst = &regs->data[priv->dreg];
- switch (priv->op) {
- case NFT_BITWISE_BOOL:
- nft_bitwise_eval_bool(dst, src, priv);
- break;
- case NFT_BITWISE_LSHIFT:
+ if (priv->op == NFT_BITWISE_MASK_XOR) {
+ nft_bitwise_eval_mask_xor(dst, src, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_LSHIFT) {
nft_bitwise_eval_lshift(dst, src, priv);
- break;
- case NFT_BITWISE_RSHIFT:
+ return;
+ }
+ if (priv->op == NFT_BITWISE_RSHIFT) {
nft_bitwise_eval_rshift(dst, src, priv);
- break;
+ return;
+ }
+
+ src2 = priv->sreg2 ? &regs->data[priv->sreg2] : priv->data.data;
+
+ if (priv->op == NFT_BITWISE_AND) {
+ nft_bitwise_eval_and(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_OR) {
+ nft_bitwise_eval_or(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_XOR) {
+ nft_bitwise_eval_xor(dst, src, src2, priv);
+ return;
}
}
static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
[NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_SREG2] = { .type = NLA_U32 },
[NFTA_BITWISE_DREG] = { .type = NLA_U32 },
[NFTA_BITWISE_LEN] = { .type = NLA_U32 },
[NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
[NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
- [NFTA_BITWISE_OP] = { .type = NLA_U32 },
+ [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_BITWISE_DATA] = { .type = NLA_NESTED },
};
-static int nft_bitwise_init_bool(struct nft_bitwise *priv,
- const struct nlattr *const tb[])
+static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
{
- struct nft_data_desc mask, xor;
+ struct nft_data_desc mask = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->mask),
+ .len = priv->len,
+ };
+ struct nft_data_desc xor = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->xor),
+ .len = priv->len,
+ };
int err;
- if (tb[NFTA_BITWISE_DATA])
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
return -EINVAL;
if (!tb[NFTA_BITWISE_MASK] ||
!tb[NFTA_BITWISE_XOR])
return -EINVAL;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask,
- tb[NFTA_BITWISE_MASK]);
+ err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) {
- err = -EINVAL;
- goto err1;
- }
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor,
- tb[NFTA_BITWISE_XOR]);
+ err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]);
if (err < 0)
- goto err1;
- if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) {
- err = -EINVAL;
- goto err2;
- }
+ goto err_xor_err;
return 0;
-err2:
- nft_data_release(&priv->xor, xor.type);
-err1:
+
+err_xor_err:
nft_data_release(&priv->mask, mask.type);
+
return err;
}
static int nft_bitwise_init_shift(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = sizeof(u32),
+ };
int err;
if (tb[NFTA_BITWISE_MASK] ||
- tb[NFTA_BITWISE_XOR])
+ tb[NFTA_BITWISE_XOR] ||
+ tb[NFTA_BITWISE_SREG2])
return -EINVAL;
if (!tb[NFTA_BITWISE_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
- tb[NFTA_BITWISE_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]);
if (err < 0)
return err;
- if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
- priv->data.data[0] >= BITS_PER_TYPE(u32)) {
- nft_data_release(&priv->data, d.type);
+
+ if (priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+ nft_data_release(&priv->data, desc.type);
return -EINVAL;
}
return 0;
}
+static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
+ struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ int err;
+
+ if (tb[NFTA_BITWISE_MASK] ||
+ tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) ||
+ (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2]))
+ return -EINVAL;
+
+ if (tb[NFTA_BITWISE_DATA]) {
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = priv->len,
+ };
+
+ err = nft_data_init(NULL, &priv->data, &desc,
+ tb[NFTA_BITWISE_DATA]);
+ if (err < 0)
+ return err;
+ } else {
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2],
+ &priv->sreg2, priv->len);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
static int nft_bitwise_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -163,57 +247,60 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
u32 len;
int err;
- if (!tb[NFTA_BITWISE_SREG] ||
- !tb[NFTA_BITWISE_DREG] ||
- !tb[NFTA_BITWISE_LEN])
- return -EINVAL;
-
err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
if (err < 0)
return err;
if (tb[NFTA_BITWISE_OP]) {
priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
switch (priv->op) {
- case NFT_BITWISE_BOOL:
+ case NFT_BITWISE_MASK_XOR:
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
break;
default:
return -EOPNOTSUPP;
}
} else {
- priv->op = NFT_BITWISE_BOOL;
+ priv->op = NFT_BITWISE_MASK_XOR;
}
switch(priv->op) {
- case NFT_BITWISE_BOOL:
- err = nft_bitwise_init_bool(priv, tb);
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_init_mask_xor(priv, tb);
break;
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
err = nft_bitwise_init_shift(priv, tb);
break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_init_bool(ctx, priv, tb);
+ break;
}
return err;
}
-static int nft_bitwise_dump_bool(struct sk_buff *skb,
- const struct nft_bitwise *priv)
+static int nft_bitwise_dump_mask_xor(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
{
if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
NFT_DATA_VALUE, priv->len) < 0)
@@ -235,7 +322,23 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb,
return 0;
}
-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (priv->sreg2) {
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2))
+ return -1;
+ } else {
+ if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
int err = 0;
@@ -250,13 +353,18 @@ static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
return -1;
switch (priv->op) {
- case NFT_BITWISE_BOOL:
- err = nft_bitwise_dump_bool(skb, priv);
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_dump_mask_xor(skb, priv);
break;
case NFT_BITWISE_LSHIFT:
case NFT_BITWISE_RSHIFT:
err = nft_bitwise_dump_shift(skb, priv);
break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_dump_bool(skb, priv);
+ break;
}
return err;
@@ -271,7 +379,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
const struct nft_bitwise *priv = nft_expr_priv(expr);
struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
- if (priv->op != NFT_BITWISE_BOOL)
+ if (priv->op != NFT_BITWISE_MASK_XOR)
return -EOPNOTSUPP;
if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
@@ -283,19 +391,257 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static bool nft_bitwise_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const struct nft_bitwise *bitwise;
+ unsigned int regcount;
+ u8 dreg;
+ int i;
+
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->sreg].num_reg == 0 &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->sreg2 == bitwise->sreg2 &&
+ priv->dreg == bitwise->dreg &&
+ priv->op == bitwise->op &&
+ priv->len == bitwise->len &&
+ !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) &&
+ !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) &&
+ !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) {
+ track->cur = expr;
+ return true;
+ }
+
+ if (track->regs[priv->sreg].bitwise ||
+ track->regs[priv->sreg].num_reg != 0) {
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ nft_reg_track_update(track, track->regs[priv->sreg].selector,
+ priv->dreg, priv->len);
+ }
+
+ dreg = priv->dreg;
+ regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ track->regs[dreg].bitwise = expr;
+
+ return false;
+}
+
static const struct nft_expr_ops nft_bitwise_ops = {
.type = &nft_bitwise_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise)),
.eval = nft_bitwise_eval,
.init = nft_bitwise_init,
.dump = nft_bitwise_dump,
+ .reduce = nft_bitwise_reduce,
.offload = nft_bitwise_offload,
};
+static int
+nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
+{
+ struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ .len = sizeof(u32),
+ };
+ int err;
+
+ err = nft_data_init(NULL, &data, &desc, tb);
+ if (err < 0)
+ return err;
+
+ *out = data.data[0];
+
+ return 0;
+}
+
+static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
+ sizeof(u32));
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_MASK] ||
+ !tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_MASK], &priv->mask);
+ if (err < 0)
+ return err;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_XOR], &priv->xor);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int
+nft_bitwise_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data data;
+
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
+ return -1;
+ if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR)))
+ return -1;
+
+ data.data[0] = priv->mask;
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ data.data[0] = priv->xor;
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ if (priv->xor || priv->sreg != priv->dreg || reg->len != sizeof(u32))
+ return -EOPNOTSUPP;
+
+ reg->mask.data[0] = priv->mask;
+ return 0;
+}
+
+static bool nft_bitwise_fast_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ const struct nft_bitwise_fast_expr *bitwise;
+
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->dreg == bitwise->dreg &&
+ priv->mask == bitwise->mask &&
+ priv->xor == bitwise->xor) {
+ track->cur = expr;
+ return true;
+ }
+
+ if (track->regs[priv->sreg].bitwise) {
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ track->regs[priv->dreg].selector =
+ track->regs[priv->sreg].selector;
+ }
+ track->regs[priv->dreg].bitwise = expr;
+
+ return false;
+}
+
+const struct nft_expr_ops nft_bitwise_fast_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_bitwise_fast_init,
+ .dump = nft_bitwise_fast_dump,
+ .reduce = nft_bitwise_fast_reduce,
+ .offload = nft_bitwise_fast_offload,
+};
+
+static const struct nft_expr_ops *
+nft_bitwise_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ int err;
+ u32 len;
+
+ if (!tb[NFTA_BITWISE_LEN] ||
+ !tb[NFTA_BITWISE_SREG] ||
+ !tb[NFTA_BITWISE_DREG])
+ return ERR_PTR(-EINVAL);
+
+ err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (len != sizeof(u32))
+ return &nft_bitwise_ops;
+
+ if (tb[NFTA_BITWISE_OP] &&
+ ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR)
+ return &nft_bitwise_ops;
+
+ return &nft_bitwise_fast_ops;
+}
+
struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
- .ops = &nft_bitwise_ops,
+ .select_ops = nft_bitwise_select_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
+
+bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_expr *last = track->last;
+ const struct nft_expr *next;
+
+ if (expr == last)
+ return false;
+
+ next = nft_expr_next(expr);
+ if (next->ops == &nft_bitwise_ops)
+ return nft_bitwise_reduce(track, next);
+ else if (next->ops == &nft_bitwise_fast_ops)
+ return nft_bitwise_fast_reduce(track, next);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise);
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 12bed3f7bbc6..af9206a3afd1 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -5,7 +5,7 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -16,8 +16,8 @@
#include <net/netfilter/nf_tables.h>
struct nft_byteorder {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
enum nft_byteorder_ops op:8;
u8 len;
u8 size;
@@ -30,28 +30,30 @@ void nft_byteorder_eval(const struct nft_expr *expr,
const struct nft_byteorder *priv = nft_expr_priv(expr);
u32 *src = &regs->data[priv->sreg];
u32 *dst = &regs->data[priv->dreg];
- union { u32 u32; u16 u16; } *s, *d;
+ u16 *s16, *d16;
unsigned int i;
- s = (void *)src;
- d = (void *)dst;
+ s16 = (void *)src;
+ d16 = (void *)dst;
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst[i], be64_to_cpu(src64));
+ nft_reg_store64(&dst64[i],
+ be64_to_cpu((__force __be64)src64));
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = (__force __u64)
cpu_to_be64(nft_reg_load64(&src[i]));
- nft_reg_store64(&dst[i], src64);
+ nft_reg_store64(&dst64[i], src64);
}
break;
}
@@ -61,11 +63,11 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = ntohl((__force __be32)s[i].u32);
+ dst[i] = ntohl((__force __be32)src[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = (__force __u32)htonl(s[i].u32);
+ dst[i] = (__force __u32)htonl(src[i]);
break;
}
break;
@@ -73,11 +75,11 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = ntohs((__force __be16)s[i].u16);
+ d16[i] = ntohs((__force __be16)s16[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = (__force __u16)htons(s[i].u16);
+ d16[i] = (__force __u16)htons(s16[i]);
break;
}
break;
@@ -87,9 +89,9 @@ void nft_byteorder_eval(const struct nft_expr *expr,
static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
[NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
[NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_byteorder_init(const struct nft_ctx *ctx,
@@ -131,23 +133,24 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
-static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_byteorder_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
@@ -167,12 +170,23 @@ nla_put_failure:
return -1;
}
+static bool nft_byteorder_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_byteorder *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
.eval = nft_byteorder_eval,
.init = nft_byteorder_init,
.dump = nft_byteorder_dump,
+ .reduce = nft_byteorder_reduce,
};
struct nft_expr_type nft_byteorder_type __read_mostly = {
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index c78d01bc02e9..b16185e9a6dd 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
switch (state->pf) {
case NFPROTO_IPV4:
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
break;
case NFPROTO_IPV6:
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
break;
default:
break;
@@ -161,16 +161,49 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
return nft_do_chain(&pkt, priv);
}
+static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state ingress_state = *state;
+ struct nft_pktinfo pkt;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
+ ingress_state.pf = NFPROTO_IPV4;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0)
+ return NF_DROP;
+ break;
+ case htons(ETH_P_IPV6):
+ ingress_state.pf = NFPROTO_IPV6;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0)
+ return NF_DROP;
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
static const struct nft_chain_type nft_chain_filter_inet = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_INET,
- .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ .hook_mask = (1 << NF_INET_INGRESS) |
+ (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING),
.hooks = {
+ [NF_INET_INGRESS] = nft_do_chain_inet_ingress,
[NF_INET_LOCAL_IN] = nft_do_chain_inet,
[NF_INET_LOCAL_OUT] = nft_do_chain_inet,
[NF_INET_FORWARD] = nft_do_chain_inet,
@@ -205,13 +238,13 @@ nft_do_chain_bridge(void *priv,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -260,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -277,79 +310,122 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_NETDEV,
- .hook_mask = (1 << NF_NETDEV_INGRESS),
+ .hook_mask = (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS),
.hooks = {
[NF_NETDEV_INGRESS] = nft_do_chain_netdev,
+ [NF_NETDEV_EGRESS] = nft_do_chain_netdev,
},
};
-static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_ctx *ctx)
+static int nft_netdev_event(unsigned long event, struct net_device *dev,
+ struct nft_base_chain *basechain, bool changename)
{
- struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
- struct nft_hook *hook, *found = NULL;
- int n = 0;
-
- if (event != NETDEV_UNREGISTER)
- return;
+ struct nft_table *table = basechain->chain.table;
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ bool match;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev == dev)
- found = hook;
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
- n++;
- }
- if (!found)
- return;
-
- if (n > 1) {
- nf_unregister_net_hook(ctx->net, &found->ops);
- list_del_rcu(&found->list);
- kfree_rcu(found, rcu);
- return;
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), ops);
+
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kmemdup(&basechain->ops,
+ sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->dev = dev;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nf_register_net_hook(dev_net(dev), ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
+ break;
}
+ return 0;
+}
+
+static int __nf_tables_netdev_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nft_base_chain *basechain;
+ struct nftables_pernet *nft_net;
+ struct nft_chain *chain;
+ struct nft_table *table;
+
+ nft_net = nft_pernet(dev_net(dev));
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (table->family != NFPROTO_NETDEV &&
+ table->family != NFPROTO_INET)
+ continue;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain))
+ continue;
- /* UNREGISTER events are also happening on netns exit.
- *
- * Although nf_tables core releases all tables/chains, only this event
- * handler provides guarantee that hook->ops.dev is still accessible,
- * so we cannot skip exiting net namespaces.
- */
- __nft_release_basechain(ctx);
+ basechain = nft_base_chain(chain);
+ if (table->family == NFPROTO_INET &&
+ basechain->ops.hooknum != NF_INET_INGRESS)
+ continue;
+
+ if (nft_netdev_event(event, dev, basechain, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct nft_table *table;
- struct nft_chain *chain, *nr;
- struct nft_ctx ctx = {
- .net = dev_net(dev),
- };
+ struct nftables_pernet *nft_net;
+ int ret = NOTIFY_DONE;
- if (event != NETDEV_UNREGISTER &&
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
event != NETDEV_CHANGENAME)
return NOTIFY_DONE;
- mutex_lock(&ctx.net->nft.commit_mutex);
- list_for_each_entry(table, &ctx.net->nft.tables, list) {
- if (table->family != NFPROTO_NETDEV)
- continue;
+ nft_net = nft_pernet(dev_net(dev));
+ mutex_lock(&nft_net->commit_mutex);
- ctx.family = table->family;
- ctx.table = table;
- list_for_each_entry_safe(chain, nr, &table->chains, list) {
- if (!nft_is_base_chain(chain))
- continue;
-
- ctx.chain = chain;
- nft_netdev_event(event, dev, &ctx);
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_netdev_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
- mutex_unlock(&ctx.net->nft.commit_mutex);
-
- return NOTIFY_DONE;
+out_unlock:
+ mutex_unlock(&nft_net->commit_mutex);
+ return ret;
}
static struct notifier_block nf_tables_netdev_notifier = {
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
index eac4a901233f..40e230d8b712 100644
--- a/net/netfilter/nft_chain_nat.c
+++ b/net/netfilter/nft_chain_nat.c
@@ -17,12 +17,12 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb,
switch (state->pf) {
#ifdef CONFIG_NF_TABLES_IPV4
case NFPROTO_IPV4:
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
break;
#endif
#ifdef CONFIG_NF_TABLES_IPV6
case NFPROTO_IPV6:
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
break;
#endif
default:
@@ -137,6 +137,7 @@ module_init(nft_chain_nat_init);
module_exit(nft_chain_nat_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("nftables network address translation support");
#ifdef CONFIG_NF_TABLES_IPV4
MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
#endif
diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
index 8826bbe71136..925db0dce48d 100644
--- a/net/netfilter/nft_chain_route.c
+++ b/net/netfilter/nft_chain_route.c
@@ -26,7 +26,7 @@ static unsigned int nf_route_table_hook4(void *priv,
u8 tos;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
mark = skb->mark;
iph = ip_hdr(skb);
@@ -42,7 +42,7 @@ static unsigned int nf_route_table_hook4(void *priv,
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -74,7 +74,7 @@ static unsigned int nf_route_table_hook6(void *priv,
int err;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
/* save source/dest address, mark, hoplimit, flowlabel, priority */
memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
@@ -92,7 +92,7 @@ static unsigned int nf_route_table_hook6(void *priv,
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u32 *)ipv6_hdr(skb)))) {
- err = nf_ip6_route_me_harder(state->net, skb);
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 8a28c127effc..2605f43737bc 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -18,7 +18,7 @@
struct nft_cmp_expr {
struct nft_data data;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_cmp_ops op:8;
};
@@ -43,7 +43,7 @@ void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_LT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_LTE:
if (d > 0)
goto mismatch;
@@ -51,7 +51,7 @@ void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_GT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_GTE:
if (d < 0)
goto mismatch;
@@ -73,22 +73,17 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
int err;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- nft_data_release(&priv->data, desc.type);
- return err;
- }
-
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -97,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
-static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);
@@ -115,21 +111,58 @@ nla_put_failure:
return -1;
}
+union nft_cmp_offload_data {
+ u16 val16;
+ u32 val32;
+ u64 val64;
+};
+
+static void nft_payload_n2h(union nft_cmp_offload_data *data,
+ const u8 *val, u32 len)
+{
+ switch (len) {
+ case 2:
+ data->val16 = ntohs(*((__be16 *)val));
+ break;
+ case 4:
+ data->val32 = ntohl(*((__be32 *)val));
+ break;
+ case 8:
+ data->val64 = be64_to_cpu(*((__be64 *)val));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_cmp_expr *priv)
{
struct nft_offload_reg *reg = &ctx->regs[priv->sreg];
+ union nft_cmp_offload_data _data, _datamask;
u8 *mask = (u8 *)&flow->match.mask;
u8 *key = (u8 *)&flow->match.key;
+ u8 *data, *datamask;
- if (priv->op != NFT_CMP_EQ || reg->len != priv->len)
+ if (priv->op != NFT_CMP_EQ || priv->len > reg->len)
return -EOPNOTSUPP;
- memcpy(key + reg->offset, &priv->data, priv->len);
- memcpy(mask + reg->offset, &reg->mask, priv->len);
+ if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) {
+ nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len);
+ nft_payload_n2h(&_datamask, (u8 *)&reg->mask, reg->len);
+ data = (u8 *)&_data;
+ datamask = (u8 *)&_datamask;
+ } else {
+ data = (u8 *)&priv->data;
+ datamask = (u8 *)&reg->mask;
+ }
+
+ memcpy(key + reg->offset, data, reg->len);
+ memcpy(mask + reg->offset, datamask, reg->len);
- flow->match.dissector.used_keys |= BIT(reg->key);
+ flow->match.dissector.used_keys |= BIT_ULL(reg->key);
flow->match.dissector.offset[reg->key] = reg->base_offset;
if (reg->key == FLOW_DISSECTOR_KEY_META &&
@@ -137,7 +170,7 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
return -EOPNOTSUPP;
- nft_offload_update_dependency(ctx, &priv->data, priv->len);
+ nft_offload_update_dependency(ctx, &priv->data, reg->len);
return 0;
}
@@ -157,34 +190,48 @@ static const struct nft_expr_ops nft_cmp_ops = {
.eval = nft_cmp_eval,
.init = nft_cmp_init,
.dump = nft_cmp_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_offload,
};
+/* Calculate the mask for the nft_cmp_fast expression. On big endian the
+ * mask needs to include the *upper* bytes when interpreting that data as
+ * something smaller than the full u32, therefore a cpu_to_le32 is done.
+ */
+static u32 nft_cmp_fast_mask(unsigned int len)
+{
+ __le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr,
+ data) * BITS_PER_BYTE - len));
+
+ return (__force u32)mask;
+}
+
static int nft_cmp_fast_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
struct nft_data data;
- u32 mask;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
int err;
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
desc.len *= BITS_PER_BYTE;
- mask = nft_cmp_fast_mask(desc.len);
- priv->data = data.data[0] & mask;
+ priv->mask = nft_cmp_fast_mask(desc.len);
+ priv->data = data.data[0] & priv->mask;
priv->len = desc.len;
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
return 0;
}
@@ -201,20 +248,22 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
},
.sreg = priv->sreg,
.len = priv->len / BITS_PER_BYTE,
- .op = NFT_CMP_EQ,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
};
return __nft_cmp_offload(ctx, flow, &cmp);
}
-static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
struct nft_data data;
if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
goto nla_put_failure;
data.data[0] = priv->data;
@@ -233,15 +282,114 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.eval = NULL, /* inlined */
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_cmp_fast_offload,
};
+static u32 nft_cmp_mask(u32 bitlen)
+{
+ return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen));
+}
+
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+ int len = bitlen / BITS_PER_BYTE;
+ int i, words = len / sizeof(u32);
+
+ for (i = 0; i < words; i++) {
+ data->data[i] = 0xffffffff;
+ bitlen -= sizeof(u32) * BITS_PER_BYTE;
+ }
+
+ if (len % sizeof(u32))
+ data->data[i++] = nft_cmp_mask(bitlen);
+
+ for (; i < 4; i++)
+ data->data[i] = 0;
+}
+
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
+ int err;
+
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE);
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+ priv->len = desc.len;
+
+ return 0;
+}
+
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = priv->data,
+ .sreg = priv->sreg,
+ .len = priv->len,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp16_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp16_fast_init,
+ .dump = nft_cmp16_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp16_fast_offload,
+};
+
static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
- struct nft_data_desc desc;
struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
enum nft_cmp_ops op;
+ u8 sreg;
int err;
if (tb[NFTA_CMP_SREG] == NULL ||
@@ -262,23 +410,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
- return &nft_cmp_fast_ops;
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+ if (desc.len <= sizeof(u32))
+ return &nft_cmp_fast_ops;
+ else if (desc.len <= sizeof(data) &&
+ ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+ (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+ return &nft_cmp16_fast_ops;
+ }
return &nft_cmp_ops;
-err1:
- nft_data_release(&data, desc.type);
- return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index f9adca62ccb3..72711d62fddf 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -19,6 +19,7 @@
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_arp/arp_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_log.h>
/* Used for matches where *info is larger than X byte */
#define NFT_MATCH_LARGE_THRESH 192
@@ -57,8 +58,13 @@ union nft_entry {
};
static inline void
-nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+nft_compat_set_par(struct xt_action_param *par,
+ const struct nft_pktinfo *pkt,
+ const void *xt, const void *xt_info)
{
+ par->state = pkt->state;
+ par->thoff = nft_thoff(pkt);
+ par->fragoff = pkt->fragoff;
par->target = xt;
par->targinfo = xt_info;
par->hotdrop = false;
@@ -71,13 +77,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -97,13 +104,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -127,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
[NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TARGET_INFO] = { .type = NLA_BINARY },
};
@@ -192,6 +200,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1]
static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
{
struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+ u32 l4proto;
u32 flags;
int err;
@@ -204,15 +213,32 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return -EINVAL;
flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
- if (flags & ~NFT_RULE_COMPAT_F_MASK)
+ if (flags & NFT_RULE_COMPAT_F_UNUSED ||
+ flags & ~NFT_RULE_COMPAT_F_MASK)
return -EINVAL;
if (flags & NFT_RULE_COMPAT_F_INV)
*inv = true;
- *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ if (l4proto > U16_MAX)
+ return -EINVAL;
+
+ *proto = l4proto;
+
return 0;
}
+static void nft_compat_wait_for_destructors(struct net *net)
+{
+ /* xtables matches or targets can have side effects, e.g.
+ * creation/destruction of /proc files.
+ * The xt ->destroy functions are run asynchronously from
+ * work queue. If we have pending invocations we thus
+ * need to wait for those to finish.
+ */
+ nf_tables_trans_destroy_flush_work(net);
+}
+
static int
nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -236,9 +262,25 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+ nft_compat_wait_for_destructors(ctx->net);
+
ret = xt_check_target(&par, size, proto, inv);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ const char *modname = NULL;
+
+ if (strcmp(target->name, "LOG") == 0)
+ modname = "nf_log_syslog";
+ else if (strcmp(target->name, "NFLOG") == 0)
+ modname = "nfnetlink_log";
+
+ if (modname &&
+ nft_request_module(ctx->net, "%s", modname) == -EAGAIN)
+ return -EAGAIN;
+ }
+
return ret;
+ }
/* The standard target cannot be used */
if (!target->target)
@@ -247,6 +289,12 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
+static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr)
+{
+ module_put(me);
+ kfree(expr->ops);
+}
+
static void
nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -262,8 +310,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
if (par.target->destroy != NULL)
par.target->destroy(&par);
- module_put(me);
- kfree(expr->ops);
+ __nft_mt_tg_destroy(me, expr);
}
static int nft_extension_dump_info(struct sk_buff *skb, int attr,
@@ -284,7 +331,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr,
return 0;
}
-static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_target_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
@@ -302,13 +350,28 @@ nla_put_failure:
}
static int nft_target_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_target *target = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -332,13 +395,14 @@ static void __nft_match_eval(const struct nft_expr *expr,
{
struct xt_match *match = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
bool ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+ nft_compat_set_par(&xt, pkt, match, info);
- ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+ ret = match->match(skb, &xt);
- if (pkt->xt.hotdrop) {
+ if (xt.hotdrop) {
regs->verdict.code = NF_DROP;
return;
}
@@ -371,7 +435,7 @@ static void nft_match_eval(const struct nft_expr *expr,
static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
[NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_MATCH_INFO] = { .type = NLA_BINARY },
};
@@ -451,6 +515,8 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+ nft_compat_wait_for_destructors(ctx->net);
+
return xt_check_match(&par, size, proto, inv);
}
@@ -469,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_match *m = expr->ops->data;
int ret;
- priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+ priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT);
if (!priv->info)
return -ENOMEM;
@@ -494,8 +560,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (par.match->destroy != NULL)
par.match->destroy(&par);
- module_put(me);
- kfree(expr->ops);
+ __nft_mt_tg_destroy(me, expr);
}
static void
@@ -530,12 +595,14 @@ nla_put_failure:
return -1;
}
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_match_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
return __nft_match_dump(skb, expr, nft_expr_priv(expr));
}
-static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+static int nft_match_large_dump(struct sk_buff *skb,
+ const struct nft_expr *e, bool reset)
{
struct nft_xt_match_priv *priv = nft_expr_priv(e);
@@ -543,13 +610,28 @@ static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
}
static int nft_match_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_match *match = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -572,19 +654,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int rev, int target)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
@@ -599,17 +676,15 @@ nla_put_failure:
return -1;
}
-static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ u8 family = info->nfmsg->nfgen_family;
+ const char *name, *fmt;
+ struct sk_buff *skb2;
int ret = 0, target;
- struct nfgenmsg *nfmsg;
- const char *fmt;
- const char *name;
u32 rev;
- struct sk_buff *skb2;
if (tb[NFTA_COMPAT_NAME] == NULL ||
tb[NFTA_COMPAT_REV] == NULL ||
@@ -620,9 +695,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
- nfmsg = nlmsg_data(nlh);
-
- switch(nfmsg->nfgen_family) {
+ switch(family) {
case AF_INET:
fmt = "ipt_%s";
break;
@@ -636,8 +709,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
fmt = "arpt_%s";
break;
default:
- pr_err("nft_compat: unsupported protocol %d\n",
- nfmsg->nfgen_family);
+ pr_err("nft_compat: unsupported protocol %d\n", family);
return -EINVAL;
}
@@ -645,9 +717,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
return -EINVAL;
rcu_read_unlock();
- try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
- rev, target, &ret),
- fmt, name);
+ try_then_request_module(xt_find_revision(family, name, rev, target, &ret),
+ fmt, name);
if (ret < 0)
goto out_put;
@@ -659,36 +730,36 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
/* include the best revision for this extension in the message */
if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_COMPAT_GET,
- nfmsg->nfgen_family,
- name, ret, target) <= 0) {
+ family, name, ret, target) <= 0) {
kfree_skb(skb2);
goto out_put;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
out_put:
rcu_read_lock();
module_put(THIS_MODULE);
- return ret == -EAGAIN ? -ENOBUFS : ret;
+
+ return ret;
}
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
[NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
.len = NFT_COMPAT_NAME_MAX-1 },
- [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
};
static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
- [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
- .attr_count = NFTA_COMPAT_MAX,
- .policy = nfnl_compat_policy_get },
+ [NFNL_MSG_COMPAT_GET] = {
+ .call = nfnl_compat_get_rcu,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get
+ },
};
static const struct nfnetlink_subsystem nfnl_compat_subsys = {
@@ -700,6 +771,14 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
static struct nft_expr_type nft_match_type;
+static bool nft_match_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct xt_match *match = expr->ops->data;
+
+ return strcmp(match->name, "comment") == 0;
+}
+
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -729,7 +808,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
@@ -742,6 +821,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_match_dump;
ops->validate = nft_match_validate;
ops->data = match;
+ ops->reduce = nft_match_reduce;
matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
if (matchsize > NFT_MATCH_LARGE_THRESH) {
@@ -818,7 +898,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
goto err;
}
- ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL);
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
if (!ops) {
err = -ENOMEM;
goto err;
@@ -831,6 +911,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,
ops->dump = nft_target_dump;
ops->validate = nft_target_validate;
ops->data = target;
+ ops->reduce = NFT_REDUCE_READONLY;
if (family == NFPROTO_BRIDGE)
ops->eval = nft_target_eval_bridge;
@@ -902,3 +983,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("match");
MODULE_ALIAS_NFT_EXPR("target");
+MODULE_DESCRIPTION("x_tables over nftables support");
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 69d6173f91e2..657764774a2d 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- struct nf_conncount_list list;
+ struct nf_conncount_list *list;
u32 limit;
bool invert;
};
@@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
const struct nft_pktinfo *pkt,
const struct nft_set_ext *ext)
{
- const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
- const struct nf_conntrack_tuple *tuple_ptr;
- struct nf_conntrack_tuple tuple;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn *ct;
unsigned int count;
+ int err;
- tuple_ptr = &tuple;
-
- ct = nf_ct_get(pkt->skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
- nft_pf(pkt), nft_net(pkt), &tuple)) {
- regs->verdict.code = NF_DROP;
- return;
- }
-
- if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) {
- regs->verdict.code = NF_DROP;
- return;
+ err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list);
+ if (err) {
+ if (err == -EEXIST) {
+ /* Call gc to update the list count if any connection has
+ * been closed already. This is useful for softlimit
+ * connections like limiting bandwidth based on a number
+ * of open connections.
+ */
+ nf_conncount_gc_list(nft_net(pkt), priv->list);
+ } else {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
}
- count = priv->list.count;
+ count = READ_ONCE(priv->list->count);
- if ((count > priv->limit) ^ priv->invert) {
+ if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -62,6 +56,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
{
bool invert = false;
u32 flags, limit;
+ int err;
if (!tb[NFTA_CONNLIMIT_COUNT])
return -EINVAL;
@@ -76,18 +71,31 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- nf_conncount_list_init(&priv->list);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
+ if (!priv->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv->list);
priv->limit = limit;
priv->invert = invert;
- return nf_ct_netns_get(ctx->net, ctx->family);
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err < 0)
+ goto err_netns;
+
+ return 0;
+err_netns:
+ kfree(priv->list);
+
+ return err;
}
static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -123,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
return nft_connlimit_do_init(ctx, tb, priv);
}
+static void nft_connlimit_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_connlimit *newpriv = nft_obj_data(newobj);
+ struct nft_connlimit *priv = nft_obj_data(obj);
+
+ WRITE_ONCE(priv->limit, newpriv->limit);
+ WRITE_ONCE(priv->invert, newpriv->invert);
+}
+
static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
struct nft_object *obj)
{
@@ -152,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = {
.init = nft_connlimit_obj_init,
.destroy = nft_connlimit_obj_destroy,
.dump = nft_connlimit_obj_dump,
+ .update = nft_connlimit_obj_update,
};
static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
@@ -171,7 +190,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr,
nft_connlimit_do_eval(priv, regs, pkt, NULL);
}
-static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_connlimit_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
@@ -195,12 +215,16 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx,
nft_connlimit_do_destroy(ctx, priv);
}
-static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- nf_conncount_list_init(&priv_dst->list);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp);
+ if (!priv_dst->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -212,19 +236,15 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- bool ret;
-
- local_bh_disable();
- ret = nf_conncount_gc_list(net, &priv->list);
- local_bh_enable();
- return ret;
+ return nf_conncount_gc_list(net, priv->list);
}
static struct nft_expr_type nft_connlimit_type;
@@ -238,6 +258,7 @@ static const struct nft_expr_ops nft_connlimit_ops = {
.destroy_clone = nft_connlimit_destroy_clone,
.dump = nft_connlimit_dump,
.gc = nft_connlimit_gc,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_connlimit_type __read_mostly = {
@@ -280,3 +301,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso");
MODULE_ALIAS_NFT_EXPR("connlimit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
+MODULE_DESCRIPTION("nftables connlimit rule support");
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index f6d4d0fa23a6..cc7325329496 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -8,13 +8,20 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
+ u64_stats_t bytes;
+ u64_stats_t packets;
+};
+
+struct nft_counter_tot {
s64 bytes;
s64 packets;
};
@@ -23,25 +30,24 @@ struct nft_counter_percpu_priv {
struct nft_counter __percpu *counter;
};
-static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync);
static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
-
- write_seqcount_begin(myseq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- this_cpu->bytes += pkt->skb->len;
- this_cpu->packets++;
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->bytes, pkt->skb->len);
+ u64_stats_inc(&this_cpu->packets);
+ u64_stats_update_end(nft_sync);
- write_seqcount_end(myseq);
local_bh_enable();
}
@@ -60,21 +66,20 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu = raw_cpu_ptr(cpu_stats);
if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ u64_stats_set(&this_cpu->packets,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])));
}
if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ u64_stats_set(&this_cpu->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])));
}
- preempt_enable();
+
priv->counter = cpu_stats;
return 0;
}
@@ -102,35 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
}
static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- this_cpu->packets -= total->packets;
- this_cpu->bytes -= total->bytes;
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, -total->packets);
+ u64_stats_add(&this_cpu->bytes, -total->bytes);
+ u64_stats_update_end(nft_sync);
+
local_bh_enable();
}
static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
struct nft_counter *this_cpu;
- const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu);
+
this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = read_seqcount_begin(myseq);
- bytes = this_cpu->bytes;
- packets = this_cpu->packets;
- } while (read_seqcount_retry(myseq, seq));
+ seq = u64_stats_fetch_begin(nft_sync);
+ bytes = u64_stats_read(&this_cpu->bytes);
+ packets = u64_stats_read(&this_cpu->packets);
+ } while (u64_stats_fetch_retry(nft_sync, seq));
total->bytes += bytes;
total->packets += packets;
@@ -141,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb,
struct nft_counter_percpu_priv *priv,
bool reset)
{
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
@@ -173,7 +184,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
-static struct nft_object_type nft_counter_obj_type;
+struct nft_object_type nft_counter_obj_type;
static const struct nft_object_ops nft_counter_obj_ops = {
.type = &nft_counter_obj_type,
.size = sizeof(struct nft_counter_percpu_priv),
@@ -183,7 +194,7 @@ static const struct nft_object_ops nft_counter_obj_ops = {
.dump = nft_counter_obj_dump,
};
-static struct nft_object_type nft_counter_obj_type __read_mostly = {
+struct nft_object_type nft_counter_obj_type __read_mostly = {
.type = NFT_OBJECT_COUNTER,
.ops = &nft_counter_obj_ops,
.maxattr = NFTA_COUNTER_MAX,
@@ -191,20 +202,20 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
nft_counter_do_eval(priv, regs, pkt);
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- return nft_counter_do_dump(skb, priv, false);
+ return nft_counter_do_dump(skb, priv, reset);
}
static int nft_counter_init(const struct nft_ctx *ctx,
@@ -224,31 +235,63 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
nft_counter_do_destroy(priv);
}
-static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
- cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp);
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->packets = total.packets;
- this_cpu->bytes = total.bytes;
- preempt_enable();
+ this_cpu = raw_cpu_ptr(cpu_stats);
+ u64_stats_set(&this_cpu->packets, total.packets);
+ u64_stats_set(&this_cpu->bytes, total.bytes);
priv_clone->counter = cpu_stats;
return 0;
}
-static struct nft_expr_type nft_counter_type;
+static int nft_counter_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ /* No specific offload action is needed, but report success. */
+ return 0;
+}
+
+static void nft_counter_offload_stats(struct nft_expr *expr,
+ const struct flow_stats *stats)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct u64_stats_sync *nft_sync;
+ struct nft_counter *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, stats->pkts);
+ u64_stats_add(&this_cpu->bytes, stats->bytes);
+ u64_stats_update_end(nft_sync);
+ local_bh_enable();
+}
+
+void nft_counter_init_seqcount(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu));
+}
+
+struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
@@ -258,9 +301,12 @@ static const struct nft_expr_ops nft_counter_ops = {
.destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_counter_offload,
+ .offload_stats = nft_counter_offload_stats,
};
-static struct nft_expr_type nft_counter_type __read_mostly = {
+struct nft_expr_type nft_counter_type __read_mostly = {
.name = "counter",
.ops = &nft_counter_ops,
.policy = nft_counter_policy,
@@ -268,38 +314,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
.flags = NFT_EXPR_STATEFUL,
.owner = THIS_MODULE,
};
-
-static int __init nft_counter_module_init(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
-
- err = nft_register_obj(&nft_counter_obj_type);
- if (err < 0)
- return err;
-
- err = nft_register_expr(&nft_counter_type);
- if (err < 0)
- goto err1;
-
- return 0;
-err1:
- nft_unregister_obj(&nft_counter_obj_type);
- return err;
-}
-
-static void __exit nft_counter_module_exit(void)
-{
- nft_unregister_expr(&nft_counter_type);
- nft_unregister_obj(&nft_counter_obj_type);
-}
-
-module_init(nft_counter_module_init);
-module_exit(nft_counter_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("counter");
-MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index faea72c2df32..6f2ae7cad731 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -12,7 +12,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_tuple.h>
@@ -22,15 +22,7 @@
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
-
-struct nft_ct {
- enum nft_ct_keys key:8;
- enum ip_conntrack_dir dir:8;
- union {
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
- };
-};
+#include <net/netfilter/nf_conntrack_seqadj.h>
struct nft_ct_helper_obj {
struct nf_conntrack_helper *helper4;
@@ -41,6 +33,7 @@ struct nft_ct_helper_obj {
#ifdef CONFIG_NF_CONNTRACK_ZONES
static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+static DEFINE_MUTEX(nft_ct_pcpu_mutex);
#endif
static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
@@ -96,7 +89,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- *dest = ct->mark;
+ *dest = READ_ONCE(ct->mark);
return;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
@@ -116,7 +109,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
helper = rcu_dereference(help->helper);
if (helper == NULL)
goto err;
- strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
+ strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
return;
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS: {
@@ -129,7 +122,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
}
#endif
- case NFT_CT_BYTES: /* fallthrough */
+ case NFT_CT_BYTES:
case NFT_CT_PKTS: {
const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
u64 count = 0;
@@ -177,8 +170,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
}
#endif
case NFT_CT_ID:
- if (!nf_ct_is_confirmed(ct))
- goto err;
*dest = nf_ct_get_id(ct);
return;
default:
@@ -204,12 +195,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
case NFT_CT_SRC_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->src.u3.ip;
+ *dest = (__force __u32)tuple->src.u3.ip;
return;
case NFT_CT_DST_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->dst.u3.ip;
+ *dest = (__force __u32)tuple->dst.u3.ip;
return;
case NFT_CT_SRC_IP6:
if (nf_ct_l3num(ct) != NFPROTO_IPV6)
@@ -240,6 +231,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
enum ip_conntrack_info ctinfo;
u16 value = nft_reg_load16(&regs->data[priv->sreg]);
struct nf_conn *ct;
+ int oldcnt;
ct = nf_ct_get(skb, &ctinfo);
if (ct) /* already tracked */
@@ -260,18 +252,22 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+ __refcount_inc(&ct->ct_general.use, &oldcnt);
+ if (likely(oldcnt == 1)) {
nf_ct_zone_add(ct, &zone);
} else {
- /* previous skb got queued to userspace */
+ refcount_dec(&ct->ct_general.use);
+ /* previous skb got queued to userspace, allocate temporary
+ * one until percpu template can be reused.
+ */
ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
if (!ct) {
regs->verdict.code = NF_DROP;
return;
}
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
}
- atomic_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
}
#endif
@@ -295,8 +291,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- if (ct->mark != value) {
- ct->mark = value;
+ if (READ_ONCE(ct->mark) != value) {
+ WRITE_ONCE(ct->mark, value);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -340,7 +336,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_DREG] = { .type = NLA_U32 },
- [NFTA_CT_KEY] = { .type = NLA_U32 },
+ [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_CT_DIRECTION] = { .type = NLA_U8 },
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
@@ -376,7 +372,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
return false;
}
- atomic_set(&tmp->ct_general.use, 1);
+ __set_bit(IPS_CONFIRMED_BIT, &tmp->status);
per_cpu(nft_ct_pcpu_template, cpu) = tmp;
}
@@ -384,6 +380,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
}
#endif
+static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ if (priv->key == NFT_CT_LABELS)
+ nf_connlabels_put(ctx->net);
+#endif
+}
+
static int nft_ct_get_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -418,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DIRECTION] != NULL)
return -EINVAL;
len = NF_CT_LABELS_MAX_SIZE;
+
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
break;
#endif
case NFT_CT_HELPER:
@@ -483,6 +491,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
break;
#endif
case NFT_CT_ID:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+
len = sizeof(u32);
break;
default:
@@ -496,19 +507,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
case IP_CT_DIR_REPLY:
break;
default:
- return -EINVAL;
+ err = -EINVAL;
+ goto err;
}
}
- priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
if (err < 0)
- return err;
+ goto err;
err = nf_ct_netns_get(ctx->net, ctx->family);
if (err < 0)
- return err;
+ goto err;
if (priv->key == NFT_CT_BYTES ||
priv->key == NFT_CT_PKTS ||
@@ -516,6 +528,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
nf_ct_set_acct(ctx->net, true);
return 0;
+err:
+ __nft_ct_get_destroy(ctx, priv);
+ return err;
}
static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
@@ -528,8 +543,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
+ mutex_lock(&nft_ct_pcpu_mutex);
if (--nft_ct_pcpu_template_refcnt == 0)
nft_ct_tmpl_put_pcpu();
+ mutex_unlock(&nft_ct_pcpu_mutex);
+ break;
#endif
default:
break;
@@ -566,9 +584,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
- if (!nft_ct_tmpl_alloc_pcpu())
+ mutex_lock(&nft_ct_pcpu_mutex);
+ if (!nft_ct_tmpl_alloc_pcpu()) {
+ mutex_unlock(&nft_ct_pcpu_mutex);
return -ENOMEM;
+ }
nft_ct_pcpu_template_refcnt++;
+ mutex_unlock(&nft_ct_pcpu_mutex);
len = sizeof(u16);
break;
#endif
@@ -602,8 +624,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
}
}
- priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len);
if (err < 0)
goto err1;
@@ -621,6 +643,9 @@ err1:
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ __nft_ct_get_destroy(ctx, priv);
nf_ct_netns_put(ctx->net, ctx->family);
}
@@ -633,7 +658,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -672,7 +698,31 @@ nla_put_failure:
return -1;
}
-static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static bool nft_ct_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ const struct nft_ct *ct;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ ct = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != ct->key) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static int nft_ct_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -705,7 +755,38 @@ static const struct nft_expr_ops nft_ct_get_ops = {
.init = nft_ct_get_init,
.destroy = nft_ct_get_destroy,
.dump = nft_ct_get_dump,
+ .reduce = nft_ct_get_reduce,
+};
+
+static bool nft_ct_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_ct_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static const struct nft_expr_ops nft_ct_get_fast_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_get_fast_eval,
+ .init = nft_ct_get_init,
+ .destroy = nft_ct_get_destroy,
+ .dump = nft_ct_get_dump,
+ .reduce = nft_ct_set_reduce,
};
+#endif
static const struct nft_expr_ops nft_ct_set_ops = {
.type = &nft_ct_type,
@@ -714,6 +795,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -724,6 +806,7 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#endif
@@ -737,8 +820,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG])
return ERR_PTR(-EINVAL);
- if (tb[NFTA_CT_DREG])
+ if (tb[NFTA_CT_DREG]) {
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+
+ switch (k) {
+ case NFT_CT_STATE:
+ case NFT_CT_DIRECTION:
+ case NFT_CT_STATUS:
+ case NFT_CT_MARK:
+ case NFT_CT_SECMARK:
+ return &nft_ct_get_fast_ops;
+ }
+#endif
return &nft_ct_get_ops;
+ }
if (tb[NFTA_CT_SREG]) {
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -780,6 +876,7 @@ static const struct nft_expr_ops nft_notrack_ops = {
.type = &nft_notrack_type,
.size = NFT_EXPR_SIZE(0),
.eval = nft_notrack_eval,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_notrack_type __read_mostly = {
@@ -854,7 +951,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj,
*/
values = nf_ct_timeout_data(timeout);
if (values)
- nf_ct_refresh(ct, pkt->skb, values[0]);
+ nf_ct_refresh(ct, values[0]);
}
static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
@@ -990,7 +1087,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
if (!priv->l4proto)
return -ENOENT;
- nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
+ nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
if (tb[NFTA_CT_HELPER_L3PROTO])
family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO]));
@@ -1013,8 +1110,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
help6 = nf_conntrack_helper_try_module_get(name, family,
priv->l4proto);
break;
- case NFPROTO_NETDEV: /* fallthrough */
- case NFPROTO_BRIDGE: /* same */
+ case NFPROTO_NETDEV:
+ case NFPROTO_BRIDGE:
case NFPROTO_INET:
help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4,
priv->l4proto);
@@ -1096,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
if (help) {
rcu_assign_pointer(help->helper, to_assign);
set_bit(IPS_HELPER_BIT, &ct->status);
+
+ if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct))
+ if (!nfct_seqadj_ext_add(ct))
+ regs->verdict.code = NF_DROP;
}
}
@@ -1178,7 +1279,30 @@ static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_EXPECT_L3PROTO])
priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+ switch (priv->l3num) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET)
+ break;
+
+ return -EINVAL;
+ case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */
+ default:
+ return -EAFNOSUPPORT;
+ }
+
priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ switch (priv->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_DCCP:
+ case IPPROTO_SCTP:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
@@ -1220,7 +1344,7 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj,
struct nf_conn *ct;
ct = nf_ct_get(pkt->skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_UNTRACKED) {
+ if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -1345,3 +1469,4 @@ MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
+MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c
new file mode 100644
index 000000000000..e684c8a91848
--- /dev/null
+++ b/net/netfilter/nft_ct_fast.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#if IS_ENABLED(CONFIG_NFT_CT)
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack.h>
+
+void nft_ct_get_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int state;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ if (ct)
+ state = NF_CT_STATE_BIT(ctinfo);
+ else if (ctinfo == IP_CT_UNTRACKED)
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ else
+ state = NF_CT_STATE_INVALID_BIT;
+ *dest = state;
+ return;
+ default:
+ break;
+ }
+
+ if (!ct) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ nft_reg_store8(dest, CTINFO2DIR(ctinfo));
+ return;
+ case NFT_CT_STATUS:
+ *dest = ct->status;
+ return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ *dest = ct->mark;
+ return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ *dest = ct->secmark;
+ return;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval);
+#endif
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index c2e78c160fd7..0573f96ce079 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_dup_netdev.h>
struct nft_dup_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_dup_netdev_eval(const struct nft_expr *expr,
@@ -40,11 +40,12 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
-static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_netdev *priv = nft_expr_priv(expr);
@@ -67,6 +68,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif);
}
+static bool nft_dup_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
static struct nft_expr_type nft_dup_netdev_type;
static const struct nft_expr_ops nft_dup_netdev_ops = {
.type = &nft_dup_netdev_type,
@@ -74,7 +80,9 @@ static const struct nft_expr_ops nft_dup_netdev_ops = {
.eval = nft_dup_netdev_eval,
.init = nft_dup_netdev_init,
.dump = nft_dup_netdev_dump,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_dup_netdev_offload,
+ .offload_action = nft_dup_netdev_offload_action,
};
static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
@@ -102,3 +110,4 @@ module_exit(nft_dup_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(5, "dup");
+MODULE_DESCRIPTION("nftables netdev packet duplication support");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 64ca13a1885b..7807d8129664 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -16,42 +16,62 @@ struct nft_dynset {
struct nft_set *set;
struct nft_set_ext_tmpl tmpl;
enum nft_dynset_ops op:8;
- enum nft_registers sreg_key:8;
- enum nft_registers sreg_data:8;
+ u8 sreg_key;
+ u8 sreg_data;
bool invert;
+ bool expr;
+ u8 num_exprs;
u64 timeout;
- struct nft_expr *expr;
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX];
struct nft_set_binding binding;
};
-static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
- struct nft_regs *regs)
+static int nft_dynset_expr_setup(const struct nft_dynset *priv,
+ const struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
+ return -1;
+
+ elem_expr->size += priv->expr_array[i]->ops->size;
+ }
+
+ return 0;
+}
+
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+ const struct nft_expr *expr,
+ struct nft_regs *regs)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set_ext *ext;
+ void *elem_priv;
u64 timeout;
- void *elem;
if (!atomic_add_unless(&set->nelems, 1, set->size))
return NULL;
- timeout = priv->timeout ? : set->timeout;
- elem = nft_set_elem_init(set, &priv->tmpl,
- &regs->data[priv->sreg_key], NULL,
- &regs->data[priv->sreg_data],
- timeout, 0, GFP_ATOMIC);
- if (elem == NULL)
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ elem_priv = nft_set_elem_init(set, &priv->tmpl,
+ &regs->data[priv->sreg_key], NULL,
+ &regs->data[priv->sreg_data],
+ timeout, 0, GFP_ATOMIC);
+ if (IS_ERR(elem_priv))
goto err1;
- ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL &&
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ ext = nft_set_elem_ext(set, elem_priv);
+ if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
goto err2;
- return elem;
+ return elem_priv;
err2:
- nft_set_elem_destroy(set, elem, false);
+ nft_set_elem_destroy(set, elem_priv, false);
err1:
if (set->size)
atomic_dec(&set->nelems);
@@ -71,12 +91,13 @@ void nft_dynset_eval(const struct nft_expr *expr,
return;
}
- if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
- expr, regs, &ext)) {
+ ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs);
+ if (ext) {
if (priv->op == NFT_DYNSET_OP_UPDATE &&
- nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- timeout = priv->timeout ? : set->timeout;
- *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
+ nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
+ READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout);
}
nft_set_elem_update_expr(ext, regs, pkt);
@@ -90,29 +111,66 @@ void nft_dynset_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_dynset_ext_add_expr(struct nft_dynset *priv)
+{
+ u8 size = 0;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++)
+ size += priv->expr_array[i]->ops->size;
+
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+}
+
+static struct nft_expr *
+nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, int pos)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_set_elem_expr_alloc(ctx, set, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_dynset_expr;
+ }
+
+ return expr;
+
+err_dynset_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
- [NFTA_DYNSET_OP] = { .type = NLA_U32 },
+ [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
[NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_dynset *priv = nft_expr_priv(expr);
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
u64 timeout;
- int err;
+ int err, i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
tb[NFTA_DYNSET_OP] == NULL ||
@@ -121,11 +179,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
-
- if (flags & ~NFT_DYNSET_F_INV)
- return -EINVAL;
+ if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR))
+ return -EOPNOTSUPP;
if (flags & NFT_DYNSET_F_INV)
priv->invert = true;
+ if (flags & NFT_DYNSET_F_EXPR)
+ priv->expr = true;
}
set = nft_set_lookup_global(ctx->net, ctx->table,
@@ -134,6 +193,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
+ if (set->flags & NFT_SET_OBJECT)
+ return -EOPNOTSUPP;
+
if (set->ops->update == NULL)
return -EOPNOTSUPP;
@@ -141,70 +203,119 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return -EBUSY;
priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP]));
- switch (priv->op) {
- case NFT_DYNSET_OP_ADD:
- case NFT_DYNSET_OP_DELETE:
- break;
- case NFT_DYNSET_OP_UPDATE:
- if (!(set->flags & NFT_SET_TIMEOUT))
- return -EOPNOTSUPP;
- break;
- default:
+ if (priv->op > NFT_DYNSET_OP_DELETE)
return -EOPNOTSUPP;
- }
timeout = 0;
if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
- return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- tb[NFTA_DYNSET_TIMEOUT])));
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
- priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
- err = nft_validate_register_load(priv->sreg_key, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key,
+ set->klen);
if (err < 0)
return err;
if (tb[NFTA_DYNSET_SREG_DATA] != NULL) {
if (!(set->flags & NFT_SET_MAP))
- return -EINVAL;
+ return -EOPNOTSUPP;
if (set->dtype == NFT_DATA_VERDICT)
return -EOPNOTSUPP;
- priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]);
- err = nft_validate_register_load(priv->sreg_data, set->dlen);
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA],
+ &priv->sreg_data, set->dlen);
if (err < 0)
return err;
} else if (set->flags & NFT_SET_MAP)
return -EINVAL;
- if (tb[NFTA_DYNSET_EXPR] != NULL) {
- if (!(set->flags & NFT_SET_EVAL))
- return -EINVAL;
+ if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) &&
+ !(set->flags & NFT_SET_EVAL))
+ return -EINVAL;
+
+ if (tb[NFTA_DYNSET_EXPR]) {
+ struct nft_expr *dynset_expr;
+
+ dynset_expr = nft_dynset_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR], 0);
+ if (IS_ERR(dynset_expr))
+ return PTR_ERR(dynset_expr);
+
+ priv->num_exprs++;
+ priv->expr_array[0] = dynset_expr;
+
+ if (set->num_exprs > 1 ||
+ (set->num_exprs == 1 &&
+ dynset_expr->ops != set->exprs[0]->ops)) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ } else if (tb[NFTA_DYNSET_EXPRESSIONS]) {
+ struct nft_expr *dynset_expr;
+ struct nlattr *tmp;
+ int left;
- priv->expr = nft_set_elem_expr_alloc(ctx, set,
- tb[NFTA_DYNSET_EXPR]);
- if (IS_ERR(priv->expr))
- return PTR_ERR(priv->expr);
+ if (!priv->expr)
+ return -EINVAL;
- if (set->expr && set->expr->ops != priv->expr->ops) {
+ i = 0;
+ nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_expr_free;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i);
+ if (IS_ERR(dynset_expr)) {
+ err = PTR_ERR(dynset_expr);
+ goto err_expr_free;
+ }
+ priv->expr_array[i] = dynset_expr;
+ priv->num_exprs++;
+
+ if (set->num_exprs) {
+ if (i >= set->num_exprs) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ if (dynset_expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
err = -EOPNOTSUPP;
goto err_expr_free;
}
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, priv->expr_array);
+ if (err < 0)
+ return err;
+
+ priv->num_exprs = set->num_exprs;
}
nft_set_ext_prepare(&priv->tmpl);
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);
if (set->flags & NFT_SET_MAP)
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen);
- if (priv->expr != NULL)
- nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR,
- priv->expr->ops->size);
- if (set->flags & NFT_SET_TIMEOUT) {
- if (timeout || set->timeout)
- nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
- }
+
+ if (priv->num_exprs)
+ nft_dynset_ext_add_expr(priv);
+
+ if (set->flags & NFT_SET_TIMEOUT &&
+ (timeout || READ_ONCE(set->timeout)))
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
priv->timeout = timeout;
@@ -219,8 +330,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return 0;
err_expr_free:
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
return err;
}
@@ -238,24 +349,27 @@ static void nft_dynset_activate(const struct nft_ctx *ctx,
{
struct nft_dynset *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_dynset_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
+ int i;
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dynset_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
+ int i;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -267,11 +381,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(priv->timeout)),
+ nf_jiffies64_to_msecs(priv->timeout),
NFTA_DYNSET_PAD))
goto nla_put_failure;
- if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
- goto nla_put_failure;
+ if (priv->set->num_exprs == 0) {
+ if (priv->num_exprs == 1) {
+ if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
+ priv->expr_array[0], reset))
+ goto nla_put_failure;
+ } else if (priv->num_exprs > 1) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ priv->expr_array[i], reset))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ }
if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
@@ -289,6 +421,7 @@ static const struct nft_expr_ops nft_dynset_ops = {
.activate = nft_dynset_activate,
.deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_dynset_type __read_mostly = {
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 07782836fad6..7eedf4e3ae9c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -5,11 +5,13 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/dccp.h>
+#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/tcp.h>
@@ -19,8 +21,8 @@ struct nft_exthdr {
u8 offset;
u8 len;
u8 op;
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
+ u8 dreg;
+ u8 sreg;
u8 flags;
};
@@ -33,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset)
return opt[offset + 1];
}
+static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len)
+{
+ if (len % NFT_REG32_SIZE)
+ dest[len / NFT_REG32_SIZE] = 0;
+
+ return skb_copy_bits(skb, offset, dest, len);
+}
+
static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -42,17 +52,19 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
unsigned int offset = 0;
int err;
+ if (pkt->skb->protocol != htons(ETH_P_IPV6))
+ goto err;
+
err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = (err >= 0);
+ nft_reg_store8(dest, err >= 0);
return;
} else if (err < 0) {
goto err;
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
goto err;
return;
err:
@@ -73,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct iphdr *iph, _iph;
- unsigned int start;
bool found = false;
__be32 info;
int optlen;
@@ -81,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (!iph)
return -EBADMSG;
- start = sizeof(struct iphdr);
optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
if (optlen <= 0)
@@ -91,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
/* Copy the options since __ip_options_compile() modifies
* the options.
*/
- if (skb_copy_bits(skb, start, opt->__data, optlen))
+ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen))
return -EBADMSG;
opt->optlen = optlen;
@@ -106,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
found = target == IPOPT_SSRR ? opt->is_strictroute :
!opt->is_strictroute;
if (found)
- *offset = opt->srr + start;
+ *offset = opt->srr;
break;
case IPOPT_RR:
if (!opt->rr)
break;
- *offset = opt->rr + start;
+ *offset = opt->rr;
found = true;
break;
case IPOPT_RA:
if (!opt->router_alert)
break;
- *offset = opt->router_alert + start;
+ *offset = opt->router_alert;
found = true;
break;
default:
@@ -141,15 +151,14 @@ static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = (err >= 0);
+ nft_reg_store8(dest, err >= 0);
return;
} else if (err < 0) {
goto err;
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
goto err;
return;
err:
@@ -162,10 +171,10 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
{
struct tcphdr *tcph;
- if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff)
return NULL;
- tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer);
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer);
if (!tcph)
return NULL;
@@ -173,7 +182,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len)
return NULL;
- return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer);
+ return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer);
}
static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
@@ -203,9 +212,10 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
offset = i + priv->offset;
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = 1;
+ nft_reg_store8(dest, 1);
} else {
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
memcpy(dest, opt + offset, priv->len);
}
@@ -231,9 +241,14 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
if (!tcph)
- return;
+ goto err;
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto err;
+
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
opt = (u8 *)tcph;
+
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
__be16 v16;
@@ -246,22 +261,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
continue;
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
- return;
-
- if (skb_ensure_writable(pkt->skb,
- pkt->xt.thoff + i + priv->len))
- return;
-
- tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
- &tcphdr_len);
- if (!tcph)
- return;
+ goto err;
offset = i + priv->offset;
switch (priv->len) {
case 2:
- old.v16 = get_unaligned((u16 *)(opt + offset));
+ old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset));
new.v16 = (__force __be16)nft_reg_load16(
&regs->data[priv->sreg]);
@@ -276,18 +282,18 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (old.v16 == new.v16)
return;
- put_unaligned(new.v16, (u16*)(opt + offset));
+ put_unaligned(new.v16, (__be16*)(opt + offset));
inet_proto_csum_replace2(&tcph->check, pkt->skb,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = regs->data[priv->sreg];
- old.v32 = get_unaligned((u32 *)(opt + offset));
+ new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]);
+ old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
return;
- put_unaligned(new.v32, (u32*)(opt + offset));
+ put_unaligned(new.v32, (__be32*)(opt + offset));
inet_proto_csum_replace4(&tcph->check, pkt->skb,
old.v32, new.v32, false);
break;
@@ -298,15 +304,194 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
return;
}
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, tcphdr_len, optl;
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!tcph)
+ goto err;
+
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto drop;
+
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
+ opt = (u8 *)tcph;
+
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ unsigned int j;
+
+ optl = optlen(opt, i);
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len)
+ goto drop;
+
+ for (j = 0; j < optl; ++j) {
+ u16 n = TCPOPT_NOP;
+ u16 o = opt[i+j];
+
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+ htons(n), false);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ return;
+ }
+
+ /* option not found, continue. This allows to do multiple
+ * option removals per rule.
+ */
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+ return;
+drop:
+ /* can't remove, no choice but to drop */
+ regs->verdict.code = NF_DROP;
+}
+
+static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr);
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+
+ if (pkt->tprot != IPPROTO_SCTP)
+ goto err;
+
+ do {
+ sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch);
+ if (!sch || !sch->length)
+ break;
+
+ if (sch->type == priv->type) {
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ nft_reg_store8(dest, true);
+ return;
+ }
+ if (priv->offset + priv->len > ntohs(sch->length) ||
+ offset + ntohs(sch->length) > pkt->skb->len)
+ break;
+
+ if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset,
+ dest, priv->len) < 0)
+ break;
+ return;
+ }
+ offset += SCTP_PAD4(ntohs(sch->length));
+ } while (offset < pkt->skb->len);
+err:
+ if (priv->flags & NFT_EXTHDR_F_PRESENT)
+ nft_reg_store8(dest, false);
+ else
+ regs->verdict.code = NFT_BREAK;
}
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static void nft_exthdr_dccp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int thoff, dataoff, optoff, optlen, i;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct dccp_hdr *dh;
+ struct dccp_hdr _dh;
+
+ if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff)
+ goto err;
+
+ thoff = nft_thoff(pkt);
+
+ dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh);
+ if (!dh)
+ goto err;
+
+ dataoff = dh->dccph_doff * sizeof(u32);
+ optoff = __dccp_hdr_len(dh);
+ if (dataoff <= optoff)
+ goto err;
+
+ optlen = dataoff - optoff;
+
+ for (i = 0; i < optlen; ) {
+ /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in
+ * the length; the remaining options are at least 2B long. In
+ * all cases, the first byte contains the option type. In
+ * multi-byte options, the second byte contains the option
+ * length, which must be at least two: 1 for the type plus 1 for
+ * the length plus 0-253 for any following option data. We
+ * aren't interested in the option data, only the type and the
+ * length, so we don't need to read more than two bytes at a
+ * time.
+ */
+ unsigned int buflen = optlen - i;
+ u8 buf[2], *bufp;
+ u8 type, len;
+
+ if (buflen > sizeof(buf))
+ buflen = sizeof(buf);
+
+ bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen,
+ &buf);
+ if (!bufp)
+ goto err;
+
+ type = bufp[0];
+
+ if (type == priv->type) {
+ nft_reg_store8(dest, 1);
+ return;
+ }
+
+ if (type <= DCCPO_MAX_RESERVED) {
+ i++;
+ continue;
+ }
+
+ if (buflen < 2)
+ goto err;
+
+ len = bufp[1];
+
+ if (len < 2)
+ goto err;
+
+ i += len;
+ }
+
+err:
+ *dest = 0;
+}
+#endif
+
static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
[NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
[NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
[NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
- [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
- [NFTA_EXTHDR_OP] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_SREG] = { .type = NLA_U32 },
};
@@ -350,12 +535,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
@@ -400,11 +585,33 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg,
+ priv->len);
+}
+
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_EXTHDR_SREG] ||
+ tb[NFTA_EXTHDR_DREG] ||
+ tb[NFTA_EXTHDR_FLAGS] ||
+ tb[NFTA_EXTHDR_OFFSET] ||
+ tb[NFTA_EXTHDR_LEN])
+ return -EINVAL;
+
+ if (!tb[NFTA_EXTHDR_TYPE])
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->op = NFT_EXTHDR_OP_TCPOPT;
+
+ return 0;
}
static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
@@ -429,6 +636,24 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
return 0;
}
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static int nft_exthdr_dccp_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ if (!(priv->flags & NFT_EXTHDR_F_PRESENT))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+#endif
+
static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
{
if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type))
@@ -447,7 +672,8 @@ nla_put_failure:
return -1;
}
-static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -457,7 +683,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_set(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -467,12 +694,48 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
+static int nft_exthdr_dump_strip(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ return nft_exthdr_dump_common(skb, priv);
+}
+
+static bool nft_exthdr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+ const struct nft_exthdr *exthdr;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ exthdr = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->type != exthdr->type ||
+ priv->op != exthdr->op ||
+ priv->flags != exthdr->flags ||
+ priv->offset != exthdr->offset ||
+ priv->len != exthdr->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
.eval = nft_exthdr_ipv6_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
@@ -481,6 +744,7 @@ static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
.eval = nft_exthdr_ipv4_eval,
.init = nft_exthdr_ipv4_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
@@ -489,6 +753,7 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.eval = nft_exthdr_tcp_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
@@ -497,8 +762,38 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
.eval = nft_exthdr_tcp_set_eval,
.init = nft_exthdr_tcp_set_init,
.dump = nft_exthdr_dump_set,
+ .reduce = NFT_REDUCE_READONLY,
};
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_strip_eval,
+ .init = nft_exthdr_tcp_strip_init,
+ .dump = nft_exthdr_dump_strip,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static const struct nft_expr_ops nft_exthdr_sctp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_sctp_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static const struct nft_expr_ops nft_exthdr_dccp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_dccp_eval,
+ .init = nft_exthdr_dccp_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+#endif
+
static const struct nft_expr_ops *
nft_exthdr_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -518,7 +813,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_tcp_set_ops;
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_tcp_ops;
- break;
+ return &nft_exthdr_tcp_strip_ops;
case NFT_EXTHDR_OP_IPV6:
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
@@ -529,6 +824,16 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_ipv4_ops;
}
break;
+ case NFT_EXTHDR_OP_SCTP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_sctp_ops;
+ break;
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+ case NFT_EXTHDR_OP_DCCP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_dccp_ops;
+ break;
+#endif
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index cfac0964f48d..96e02a83c045 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -14,27 +14,29 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
+#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
+ NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
+ NFTA_FIB_F_PRESENT)
+
const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
[NFTA_FIB_DREG] = { .type = NLA_U32 },
[NFTA_FIB_RESULT] = { .type = NLA_U32 },
- [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
+ [NFTA_FIB_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL),
};
EXPORT_SYMBOL(nft_fib_policy);
-#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
- NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
- NFTA_FIB_F_PRESENT)
-
-int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_fib *priv = nft_expr_priv(expr);
unsigned int hooks;
switch (priv->result) {
- case NFT_FIB_RESULT_OIF: /* fallthrough */
+ case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
- hooks = (1 << NF_INET_PRE_ROUTING);
+ hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
break;
case NFT_FIB_RESULT_ADDRTYPE:
if (priv->flags & NFTA_FIB_F_IIF)
@@ -73,7 +75,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
- if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
+ if (priv->flags == 0)
return -EINVAL;
if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
@@ -86,7 +88,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
- priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
@@ -106,8 +107,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
if (err < 0)
return err;
@@ -115,7 +116,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
}
EXPORT_SYMBOL_GPL(nft_fib_init);
-int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset)
{
const struct nft_fib *priv = nft_expr_priv(expr);
@@ -141,13 +142,17 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
index = dev ? dev->ifindex : 0;
- *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index;
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ nft_reg_store8(dreg, !!index);
+ else
+ *dreg = index;
+
break;
case NFT_FIB_RESULT_OIFNAME:
if (priv->flags & NFTA_FIB_F_PRESENT)
- *dreg = !!dev;
+ nft_reg_store8(dreg, !!dev);
else
- strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
+ strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ);
break;
default:
WARN_ON_ONCE(1);
@@ -157,5 +162,48 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
}
EXPORT_SYMBOL_GPL(nft_fib_store_result);
+bool nft_fib_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int len = NFT_REG32_SIZE;
+ const struct nft_fib *fib;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ len = NFT_REG32_SIZE;
+ else
+ len = IFNAMSIZ;
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ fib = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->result != fib->result ||
+ priv->flags != fib->flags) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_fib_reduce);
+
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Query routing table from nftables");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
index 465432e0531b..666a3741d20b 100644
--- a/net/netfilter/nft_fib_inet.c
+++ b/net/netfilter/nft_fib_inet.c
@@ -49,6 +49,7 @@ static const struct nft_expr_ops nft_fib_inet_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_inet_type __read_mostly = {
@@ -76,3 +77,4 @@ module_exit(nft_fib_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
+MODULE_DESCRIPTION("nftables fib inet support");
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index a2e726ae7f07..9121ec64e918 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -58,6 +58,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
@@ -85,3 +86,4 @@ module_exit(nft_fib_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo M. Bermudo Garay <pablombg@gmail.com>");
MODULE_ALIAS_NFT_AF_EXPR(5, "fib");
+MODULE_DESCRIPTION("nftables netdev fib lookups support");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b70b48996801..b8f76c9057fd 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -8,7 +8,8 @@
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/ip.h> /* for ipv4 options. */
+#include <net/ip.h>
+#include <net/flow.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -19,37 +20,6 @@ struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
-static int nft_flow_route(const struct nft_pktinfo *pkt,
- const struct nf_conn *ct,
- struct nf_flow_route *route,
- enum ip_conntrack_dir dir)
-{
- struct dst_entry *this_dst = skb_dst(pkt->skb);
- struct dst_entry *other_dst = NULL;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
- fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
- break;
- case NFPROTO_IPV6:
- fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
- fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
- break;
- }
-
- nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
- if (!other_dst)
- return -ENOENT;
-
- route->tuple[dir].dst = this_dst;
- route->tuple[!dir].dst = other_dst;
-
- return 0;
-}
-
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
if (skb_sec_path(skb))
@@ -67,6 +37,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
return false;
}
+static void flow_offload_ct_tcp(struct nf_conn *ct)
+{
+ /* conntrack will not see all packets, disable tcp window validation. */
+ spin_lock_bh(&ct->lock);
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ spin_unlock_bh(&ct->lock);
+}
+
static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -74,8 +53,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
struct nft_flow_offload *priv = nft_expr_priv(expr);
struct nf_flowtable *flowtable = &priv->flowtable->data;
struct tcphdr _tcph, *tcph = NULL;
+ struct nf_flow_route route = {};
enum ip_conntrack_info ctinfo;
- struct nf_flow_route route;
struct flow_offload *flow;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
@@ -90,19 +69,33 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
case IPPROTO_TCP:
- tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff,
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
sizeof(_tcph), &_tcph);
- if (unlikely(!tcph || tcph->fin || tcph->rst))
+ if (unlikely(!tcph || tcph->fin || tcph->rst ||
+ !nf_conntrack_tcp_established(ct)))
goto out;
break;
case IPPROTO_UDP:
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+ if (ct->status & IPS_NAT_MASK)
+ goto out;
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+ goto out;
+ break;
+ }
+#endif
default:
goto out;
}
if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
- ct->status & IPS_SEQ_ADJUST)
+ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
goto out;
if (!nf_ct_is_confirmed(ct))
@@ -112,31 +105,28 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
goto out;
dir = CTINFO2DIR(ctinfo);
- if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
goto err_flow_route;
flow = flow_offload_alloc(ct);
if (!flow)
goto err_flow_alloc;
- if (flow_offload_route_init(flow, &route) < 0)
- goto err_flow_add;
-
- if (tcph) {
- ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- }
+ flow_offload_route_init(flow, &route);
+ if (tcph)
+ flow_offload_ct_tcp(ct);
+ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
ret = flow_offload_add(flowtable, flow);
if (ret < 0)
goto err_flow_add;
- dst_release(route.tuple[!dir].dst);
return;
err_flow_add:
flow_offload_free(flow);
err_flow_alloc:
+ dst_release(route.tuple[dir].dst);
dst_release(route.tuple[!dir].dst);
err_flow_route:
clear_bit(IPS_OFFLOAD_BIT, &ct->status);
@@ -145,11 +135,15 @@ out:
}
static int nft_flow_offload_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
unsigned int hook_mask = (1 << NF_INET_FORWARD);
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, hook_mask);
}
@@ -169,13 +163,15 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
if (!tb[NFTA_FLOW_TABLE_NAME])
return -EINVAL;
- flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
+ tb[NFTA_FLOW_TABLE_NAME], genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
+ if (!nft_use_inc(&flowtable->use))
+ return -EMFILE;
+
priv->flowtable = flowtable;
- flowtable->use++;
return nf_ct_netns_get(ctx->net, ctx->family);
}
@@ -194,7 +190,7 @@ static void nft_flow_offload_activate(const struct nft_ctx *ctx,
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
- priv->flowtable->use++;
+ nft_use_inc_restore(&priv->flowtable->use);
}
static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
@@ -203,7 +199,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_flow_offload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
@@ -227,6 +224,7 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
.destroy = nft_flow_offload_destroy,
.validate = nft_flow_offload_validate,
.dump = nft_flow_offload_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
@@ -286,3 +284,4 @@ module_exit(nft_flow_offload_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
+MODULE_DESCRIPTION("nftables hardware flow offload module");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 3087e23297db..152a9fb4d23a 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -18,7 +18,7 @@
#include <net/ip.h>
struct nft_fwd_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_fwd_netdev_eval(const struct nft_expr *expr,
@@ -27,9 +27,11 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ struct sk_buff *skb = pkt->skb;
/* This is used by ifb only. */
- skb_set_redirected(pkt->skb, true);
+ skb->skb_iif = skb->dev->ifindex;
+ skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS);
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
@@ -38,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
[NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
[NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
- [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
+ [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -50,11 +52,12 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_FWD_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
-static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -77,9 +80,14 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx,
return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif);
}
+static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
struct nft_fwd_neigh {
- enum nft_registers sreg_dev:8;
- enum nft_registers sreg_addr:8;
+ u8 sreg_dev;
+ u8 sreg_addr;
u8 nfproto;
};
@@ -138,6 +146,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
return;
skb->dev = dev;
+ skb_clear_tstamp(skb);
neigh_xmit(neigh_table, dev, addr, skb);
out:
regs->verdict.code = verdict;
@@ -156,8 +165,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
!tb[NFTA_FWD_NFPROTO])
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
switch (priv->nfproto) {
@@ -171,14 +178,17 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
if (err < 0)
return err;
- return nft_validate_register_load(priv->sreg_addr, addr_len);
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
+ addr_len);
}
-static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_neigh_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
@@ -194,10 +204,10 @@ nla_put_failure:
}
static int nft_fwd_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS));
}
static struct nft_expr_type nft_fwd_netdev_type;
@@ -208,6 +218,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
.validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -217,7 +228,9 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
.validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
.offload = nft_fwd_netdev_offload,
+ .offload_action = nft_fwd_netdev_offload_action,
};
static const struct nft_expr_ops *
@@ -256,4 +269,5 @@ module_exit(nft_fwd_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nftables netdev packet forwarding support");
MODULE_ALIAS_NFT_AF_EXPR(5, "fwd");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index b836d550b919..5d034bbb6913 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -14,8 +14,8 @@
#include <linux/jhash.h>
struct nft_jhash {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
u8 len;
bool autogen_seed:1;
u32 modulus;
@@ -38,7 +38,7 @@ static void nft_jhash_eval(const struct nft_expr *expr,
}
struct nft_symhash {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
};
@@ -51,7 +51,8 @@ static void nft_symhash_eval(const struct nft_expr *expr,
struct sk_buff *skb = pkt->skb;
u32 h;
- h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus);
+ h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb),
+ priv->modulus);
regs->data[priv->dreg] = h + priv->offset;
}
@@ -59,7 +60,7 @@ static void nft_symhash_eval(const struct nft_expr *expr,
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG] = { .type = NLA_U32 },
[NFTA_HASH_DREG] = { .type = NLA_U32 },
- [NFTA_HASH_LEN] = { .type = NLA_U32 },
+ [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_HASH_MODULUS] = { .type = NLA_U32 },
[NFTA_HASH_SEED] = { .type = NLA_U32 },
[NFTA_HASH_OFFSET] = { .type = NLA_U32 },
@@ -83,9 +84,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
if (err < 0)
return err;
@@ -94,6 +92,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len);
+ if (err < 0)
+ return err;
+
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
if (priv->modulus < 1)
return -ERANGE;
@@ -108,9 +110,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
get_random_bytes(&priv->seed, sizeof(priv->seed));
}
- return nft_validate_register_load(priv->sreg, len) &&
- nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
static int nft_symhash_init(const struct nft_ctx *ctx,
@@ -126,8 +127,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
if (priv->modulus < 1)
return -ERANGE;
@@ -135,12 +134,13 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ sizeof(u32));
}
static int nft_jhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_jhash *priv = nft_expr_priv(expr);
@@ -166,8 +166,18 @@ nla_put_failure:
return -1;
}
+static bool nft_jhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_jhash *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, sizeof(u32));
+
+ return false;
+}
+
static int nft_symhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_symhash *priv = nft_expr_priv(expr);
@@ -186,6 +196,30 @@ nla_put_failure:
return -1;
}
+static bool nft_symhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_symhash *priv = nft_expr_priv(expr);
+ struct nft_symhash *symhash;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
+
+ symhash = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->offset != symhash->offset ||
+ priv->modulus != symhash->modulus) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+
static struct nft_expr_type nft_hash_type;
static const struct nft_expr_ops nft_jhash_ops = {
.type = &nft_hash_type,
@@ -193,6 +227,7 @@ static const struct nft_expr_ops nft_jhash_ops = {
.eval = nft_jhash_eval,
.init = nft_jhash_init,
.dump = nft_jhash_dump,
+ .reduce = nft_jhash_reduce,
};
static const struct nft_expr_ops nft_symhash_ops = {
@@ -201,6 +236,7 @@ static const struct nft_expr_ops nft_symhash_ops = {
.eval = nft_symhash_eval,
.init = nft_symhash_init,
.dump = nft_symhash_dump,
+ .reduce = nft_symhash_reduce,
};
static const struct nft_expr_ops *
@@ -248,3 +284,4 @@ module_exit(nft_hash_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("hash");
+MODULE_DESCRIPTION("Netfilter nftables hash module");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index c7f0ef73d939..02ee5fb69871 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -29,31 +29,62 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
[NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
};
+static enum nft_data_types nft_reg_to_type(const struct nlattr *nla)
+{
+ enum nft_data_types type;
+ u8 reg;
+
+ reg = ntohl(nla_get_be32(nla));
+ if (reg == NFT_REG_VERDICT)
+ type = NFT_DATA_VERDICT;
+ else
+ type = NFT_DATA_VALUE;
+
+ return type;
+}
+
static int nft_immediate_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_immediate_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .size = sizeof(priv->data),
+ };
int err;
if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
tb[NFTA_IMMEDIATE_DATA] == NULL)
return -EINVAL;
- err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_IMMEDIATE_DATA]);
+ desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]);
+ err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
priv->dlen = desc.len;
- priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, &priv->data,
- desc.type, desc.len);
+ err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG],
+ &priv->dreg, &priv->data, desc.type,
+ desc.len);
if (err < 0)
goto err1;
+ if (priv->dreg == NFT_REG_VERDICT) {
+ struct nft_chain *chain = priv->data.verdict.chain;
+
+ switch (priv->data.verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nf_tables_bind_chain(ctx, chain);
+ if (err < 0)
+ goto err1;
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
err1:
@@ -65,15 +96,86 @@ static void nft_immediate_activate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_activate(&chain_ctx, rule);
+
+ nft_clear(ctx->net, chain);
+ break;
+ default:
+ break;
+ }
+ }
return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg));
}
+static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx,
+ struct nft_chain *chain,
+ enum nft_trans_phase phase)
+{
+ struct nft_ctx chain_ctx;
+ struct nft_rule *rule;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_deactivate(&chain_ctx, rule, phase);
+}
+
static void nft_immediate_deactivate(const struct nft_ctx *ctx,
const struct nft_expr *expr,
enum nft_trans_phase phase)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_chain *chain;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nf_tables_unbind_chain(ctx, chain);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ case NFT_TRANS_PREPARE:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ default:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_chain_del(chain);
+ chain->bound = false;
+ nft_use_dec(&chain->table->use);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
if (phase == NFT_TRANS_COMMIT)
return;
@@ -81,7 +183,53 @@ static void nft_immediate_deactivate(const struct nft_ctx *ctx,
return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
-static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_rule *rule, *n;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ return;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+
+ if (!nft_chain_binding(chain))
+ break;
+
+ /* Rule construction failed, but chain is already bound:
+ * let the transaction records release this chain and its rules.
+ */
+ if (chain->bound) {
+ nft_use_dec(&chain->use);
+ break;
+ }
+
+ /* Rule has been deleted, release chain and its rules. */
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ nft_use_dec(&chain->use);
+ list_for_each_entry_safe(rule, n, &chain->rules, list) {
+ nft_use_dec(&chain->use);
+ list_del(&rule->list);
+ nf_tables_rule_destroy(&chain_ctx, rule);
+ }
+ nf_tables_chain_destroy(chain);
+ break;
+ default:
+ break;
+ }
+}
+
+static int nft_immediate_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
@@ -96,8 +244,7 @@ nla_put_failure:
}
static int nft_immediate_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
struct nft_ctx *pctx = (struct nft_ctx *)ctx;
@@ -163,6 +310,27 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx,
return 0;
}
+static bool nft_immediate_offload_action(const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ return true;
+
+ return false;
+}
+
+static bool nft_immediate_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ nft_reg_track_cancel(track, priv->dreg, priv->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -170,10 +338,12 @@ static const struct nft_expr_ops nft_imm_ops = {
.init = nft_immediate_init,
.activate = nft_immediate_activate,
.deactivate = nft_immediate_deactivate,
+ .destroy = nft_immediate_destroy,
.dump = nft_immediate_dump,
.validate = nft_immediate_validate,
+ .reduce = nft_immediate_reduce,
.offload = nft_immediate_offload,
- .offload_flags = NFT_OFFLOAD_F_ACTION,
+ .offload_action = nft_immediate_offload_action,
};
struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
new file mode 100644
index 000000000000..c4569d4b9228
--- /dev/null
+++ b/net/netfilter/nft_inner.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/gre.h>
+#include <net/geneve.h>
+#include <net/ip.h>
+#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+struct nft_inner_tun_ctx_locked {
+ struct nft_inner_tun_ctx ctx;
+ local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+/* Same layout as nft_expr but it embeds the private expression data area. */
+struct __nft_expr {
+ const struct nft_expr_ops *ops;
+ union {
+ struct nft_payload payload;
+ struct nft_meta meta;
+ } __attribute__((aligned(__alignof__(u64))));
+};
+
+enum {
+ NFT_INNER_EXPR_PAYLOAD,
+ NFT_INNER_EXPR_META,
+};
+
+struct nft_inner {
+ u8 flags;
+ u8 hdrsize;
+ u8 type;
+ u8 expr_type;
+
+ struct __nft_expr expr;
+};
+
+static int nft_inner_parse_l2l3(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 off)
+{
+ __be16 llproto, outer_llproto;
+ u32 nhoff, thoff;
+
+ if (priv->flags & NFT_INNER_LL) {
+ struct vlan_ethhdr *veth, _veth;
+ struct ethhdr *eth, _eth;
+ u32 hdrsize;
+
+ eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth);
+ if (!eth)
+ return -1;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ llproto = eth->h_proto;
+ hdrsize = sizeof(_eth);
+ break;
+ case htons(ETH_P_8021Q):
+ veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth);
+ if (!veth)
+ return -1;
+
+ outer_llproto = veth->h_vlan_encapsulated_proto;
+ llproto = veth->h_vlan_proto;
+ hdrsize = sizeof(_veth);
+ break;
+ default:
+ return -1;
+ }
+
+ ctx->inner_lloff = off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL;
+ off += hdrsize;
+ } else {
+ struct iphdr *iph;
+ u32 _version;
+
+ iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version);
+ if (!iph)
+ return -1;
+
+ switch (iph->version) {
+ case 4:
+ llproto = htons(ETH_P_IP);
+ break;
+ case 6:
+ llproto = htons(ETH_P_IPV6);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ ctx->llproto = llproto;
+ if (llproto == htons(ETH_P_8021Q))
+ llproto = outer_llproto;
+
+ nhoff = off;
+
+ switch (llproto) {
+ case htons(ETH_P_IP): {
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ if (iph->ihl < 5 || iph->version != 4)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff + (iph->ihl * 4);
+ if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) {
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = iph->protocol;
+ }
+ }
+ break;
+ case htons(ETH_P_IPV6): {
+ struct ipv6hdr *ip6h, _ip6h;
+ int fh_flags = IP6_FH_F_AUTH;
+ unsigned short fragoff;
+ int l4proto;
+
+ ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h);
+ if (!ip6h)
+ return -1;
+
+ if (ip6h->version != 6)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff;
+ l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags);
+ if (l4proto < 0 || thoff > U16_MAX)
+ return -1;
+
+ if (fragoff == 0) {
+ thoff = nhoff + sizeof(_ip6h);
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = l4proto;
+ }
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse_tunhdr(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 *off)
+{
+ if (pkt->tprot == IPPROTO_GRE) {
+ ctx->inner_tunoff = pkt->thoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ return 0;
+ }
+
+ if (pkt->tprot != IPPROTO_UDP)
+ return -1;
+
+ ctx->inner_tunoff = *off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ *off += priv->hdrsize;
+
+ switch (priv->type) {
+ case NFT_INNER_GENEVE: {
+ struct genevehdr *gnvh, _gnvh;
+
+ gnvh = skb_header_pointer(pkt->skb, pkt->inneroff,
+ sizeof(_gnvh), &_gnvh);
+ if (!gnvh)
+ return -1;
+
+ *off += gnvh->opt_len * 4;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse(const struct nft_inner *priv,
+ struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ u32 off = pkt->inneroff;
+
+ if (priv->flags & NFT_INNER_HDRSIZE &&
+ nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
+ return -1;
+
+ if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
+ if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
+ return -1;
+ } else if (priv->flags & NFT_INNER_TH) {
+ tun_ctx->inner_thoff = off;
+ tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ }
+
+ tun_ctx->type = priv->type;
+ tun_ctx->cookie = (unsigned long)pkt->skb;
+ pkt->flags |= NFT_PKTINFO_INNER_FULL;
+
+ return 0;
+}
+
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+ local_bh_enable();
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ return false;
+ }
+ *tun_ctx = *this_cpu_tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+
+ return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+ const struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
+ *this_cpu_tun_ctx = *tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+}
+
+static bool nft_inner_parse_needed(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
+ return true;
+
+ if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+ return true;
+
+ if (priv->type != tun_ctx->type)
+ return true;
+
+ return false;
+}
+
+static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_inner *priv = nft_expr_priv(expr);
+ struct nft_inner_tun_ctx tun_ctx = {};
+
+ if (nft_payload_inner_offset(pkt) < 0)
+ goto err;
+
+ if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+ nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
+ goto err;
+
+ switch (priv->expr_type) {
+ case NFT_INNER_EXPR_PAYLOAD:
+ nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
+ break;
+ case NFT_INNER_EXPR_META:
+ nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = {
+ [NFTA_INNER_NUM] = { .type = NLA_U32 },
+ [NFTA_INNER_FLAGS] = { .type = NLA_U32 },
+ [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 },
+ [NFTA_INNER_TYPE] = { .type = NLA_U32 },
+ [NFTA_INNER_EXPR] = { .type = NLA_NESTED },
+};
+
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+static int nft_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_inner *priv = nft_expr_priv(expr);
+ u32 flags, hdrsize, type, num;
+ struct nft_expr_info expr_info;
+ int err;
+
+ if (!tb[NFTA_INNER_FLAGS] ||
+ !tb[NFTA_INNER_NUM] ||
+ !tb[NFTA_INNER_HDRSIZE] ||
+ !tb[NFTA_INNER_TYPE] ||
+ !tb[NFTA_INNER_EXPR])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS]));
+ if (flags & ~NFT_INNER_MASK)
+ return -EOPNOTSUPP;
+
+ num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM]));
+ if (num != 0)
+ return -EOPNOTSUPP;
+
+ hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE]));
+ type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE]));
+
+ if (type > U8_MAX)
+ return -EINVAL;
+
+ if (flags & NFT_INNER_HDRSIZE) {
+ if (hdrsize == 0 || hdrsize > 64)
+ return -EOPNOTSUPP;
+ }
+
+ priv->flags = flags;
+ priv->hdrsize = hdrsize;
+ priv->type = type;
+
+ err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info);
+ if (err < 0)
+ return err;
+
+ priv->expr.ops = expr_info.ops;
+
+ if (!strcmp(expr_info.ops->type->name, "payload"))
+ priv->expr_type = NFT_INNER_EXPR_PAYLOAD;
+ else if (!strcmp(expr_info.ops->type->name, "meta"))
+ priv->expr_type = NFT_INNER_EXPR_META;
+ else
+ return -EINVAL;
+
+ err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr,
+ (const struct nlattr * const*)expr_info.tb);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_inner_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_inner *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) ||
+ nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) ||
+ nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) ||
+ nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize)))
+ goto nla_put_failure;
+
+ if (nft_expr_dump(skb, NFTA_INNER_EXPR,
+ (struct nft_expr *)&priv->expr, reset) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_expr_ops nft_inner_ops = {
+ .type = &nft_inner_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)),
+ .eval = nft_inner_eval,
+ .init = nft_inner_init,
+ .dump = nft_inner_dump,
+};
+
+struct nft_expr_type nft_inner_type __read_mostly = {
+ .name = "inner",
+ .ops = &nft_inner_ops,
+ .policy = nft_inner_policy,
+ .maxattr = NFTA_INNER_MAX,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
new file mode 100644
index 000000000000..de1b6066bfa8
--- /dev/null
+++ b/net/netfilter/nft_last.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_last {
+ unsigned long jiffies;
+ unsigned int set;
+};
+
+struct nft_last_priv {
+ struct nft_last *last;
+};
+
+static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = {
+ [NFTA_LAST_SET] = { .type = NLA_U32 },
+ [NFTA_LAST_MSECS] = { .type = NLA_U64 },
+};
+
+static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last;
+ u64 last_jiffies;
+ int err;
+
+ last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
+ if (!last)
+ return -ENOMEM;
+
+ if (tb[NFTA_LAST_SET])
+ last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+
+ if (last->set && tb[NFTA_LAST_MSECS]) {
+ err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
+ if (err < 0)
+ goto err;
+
+ last->jiffies = jiffies - (unsigned long)last_jiffies;
+ }
+ priv->last = last;
+
+ return 0;
+err:
+ kfree(last);
+
+ return err;
+}
+
+static void nft_last_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+
+ if (READ_ONCE(last->jiffies) != jiffies)
+ WRITE_ONCE(last->jiffies, jiffies);
+ if (READ_ONCE(last->set) == 0)
+ WRITE_ONCE(last->set, 1);
+}
+
+static int nft_last_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+ unsigned long last_jiffies = READ_ONCE(last->jiffies);
+ u32 last_set = READ_ONCE(last->set);
+ __be64 msecs;
+
+ if (time_before(jiffies, last_jiffies)) {
+ WRITE_ONCE(last->set, 0);
+ last_set = 0;
+ }
+
+ if (last_set)
+ msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies);
+ else
+ msecs = 0;
+
+ if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) ||
+ nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_last_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+
+ kfree(priv->last);
+}
+
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
+{
+ struct nft_last_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_last_priv *priv_src = nft_expr_priv(src);
+
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp);
+ if (!priv_dst->last)
+ return -ENOMEM;
+
+ priv_dst->last->set = priv_src->last->set;
+ priv_dst->last->jiffies = priv_src->last->jiffies;
+
+ return 0;
+}
+
+static const struct nft_expr_ops nft_last_ops = {
+ .type = &nft_last_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)),
+ .eval = nft_last_eval,
+ .init = nft_last_init,
+ .destroy = nft_last_destroy,
+ .clone = nft_last_clone,
+ .dump = nft_last_dump,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+struct nft_expr_type nft_last_type __read_mostly = {
+ .name = "last",
+ .ops = &nft_last_ops,
+ .policy = nft_last_policy,
+ .maxattr = NFTA_LAST_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 35b67d7e3694..21d26b79b460 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -18,6 +18,10 @@ struct nft_limit {
spinlock_t lock;
u64 last;
u64 tokens;
+};
+
+struct nft_limit_priv {
+ struct nft_limit *limit;
u64 tokens_max;
u64 rate;
u64 nsecs;
@@ -25,93 +29,111 @@ struct nft_limit {
bool invert;
};
-static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
+static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
{
u64 now, tokens;
s64 delta;
- spin_lock_bh(&limit->lock);
+ spin_lock_bh(&priv->limit->lock);
now = ktime_get_ns();
- tokens = limit->tokens + now - limit->last;
- if (tokens > limit->tokens_max)
- tokens = limit->tokens_max;
+ tokens = priv->limit->tokens + now - priv->limit->last;
+ if (tokens > priv->tokens_max)
+ tokens = priv->tokens_max;
- limit->last = now;
+ priv->limit->last = now;
delta = tokens - cost;
if (delta >= 0) {
- limit->tokens = delta;
- spin_unlock_bh(&limit->lock);
- return limit->invert;
+ priv->limit->tokens = delta;
+ spin_unlock_bh(&priv->limit->lock);
+ return priv->invert;
}
- limit->tokens = tokens;
- spin_unlock_bh(&limit->lock);
- return !limit->invert;
+ priv->limit->tokens = tokens;
+ spin_unlock_bh(&priv->limit->lock);
+ return !priv->invert;
}
/* Use same default as in iptables. */
#define NFT_LIMIT_PKT_BURST_DEFAULT 5
-static int nft_limit_init(struct nft_limit *limit,
+static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
- u64 unit, tokens;
+ u64 unit, tokens, rate_with_burst;
+ bool invert = false;
if (tb[NFTA_LIMIT_RATE] == NULL ||
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ if (priv->rate == 0)
+ return -EINVAL;
+
unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- limit->nsecs = unit * NSEC_PER_SEC;
- if (limit->rate == 0 || limit->nsecs < unit)
+ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
return -EOVERFLOW;
if (tb[NFTA_LIMIT_BURST])
- limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+ priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
- if (pkts && limit->burst == 0)
- limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
- if (limit->rate + limit->burst < limit->rate)
+ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
return -EOVERFLOW;
if (pkts) {
- tokens = div_u64(limit->nsecs, limit->rate) * limit->burst;
+ u64 tmp = div64_u64(priv->nsecs, priv->rate);
+
+ if (check_mul_overflow(tmp, priv->burst, &tokens))
+ return -EOVERFLOW;
} else {
+ u64 tmp;
+
/* The token bucket size limits the number of tokens can be
* accumulated. tokens_max specifies the bucket size.
* tokens_max = unit * (rate + burst) / rate.
*/
- tokens = div_u64(limit->nsecs * (limit->rate + limit->burst),
- limit->rate);
- }
+ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
+ return -EOVERFLOW;
- limit->tokens = tokens;
- limit->tokens_max = limit->tokens;
+ tokens = div64_u64(tmp, priv->rate);
+ }
if (tb[NFTA_LIMIT_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
+ if (flags & ~NFT_LIMIT_F_INV)
+ return -EOPNOTSUPP;
+
if (flags & NFT_LIMIT_F_INV)
- limit->invert = true;
+ invert = true;
}
- limit->last = ktime_get_ns();
- spin_lock_init(&limit->lock);
+
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
+ if (!priv->limit)
+ return -ENOMEM;
+
+ priv->limit->tokens = tokens;
+ priv->tokens_max = priv->limit->tokens;
+ priv->invert = invert;
+ priv->limit->last = ktime_get_ns();
+ spin_lock_init(&priv->limit->lock);
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv,
enum nft_limit_type type)
{
- u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0;
- u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0;
+ u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC);
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate),
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate),
NFTA_LIMIT_PAD) ||
nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
NFTA_LIMIT_PAD) ||
- nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
goto nla_put_failure;
@@ -121,8 +143,34 @@ nla_put_failure:
return -1;
}
-struct nft_limit_pkts {
- struct nft_limit limit;
+static void nft_limit_destroy(const struct nft_ctx *ctx,
+ const struct nft_limit_priv *priv)
+{
+ kfree(priv->limit);
+}
+
+static int nft_limit_clone(struct nft_limit_priv *priv_dst,
+ const struct nft_limit_priv *priv_src, gfp_t gfp)
+{
+ priv_dst->tokens_max = priv_src->tokens_max;
+ priv_dst->rate = priv_src->rate;
+ priv_dst->nsecs = priv_src->nsecs;
+ priv_dst->burst = priv_src->burst;
+ priv_dst->invert = priv_src->invert;
+
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp);
+ if (!priv_dst->limit)
+ return -ENOMEM;
+
+ spin_lock_init(&priv_dst->limit->lock);
+ priv_dst->limit->tokens = priv_src->tokens_max;
+ priv_dst->limit->last = ktime_get_ns();
+
+ return 0;
+}
+
+struct nft_limit_priv_pkts {
+ struct nft_limit_priv limit;
u64 cost;
};
@@ -130,7 +178,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -148,7 +196,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -159,27 +207,50 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_pkts_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
- const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
+{
+ struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
+
+ priv_dst->cost = priv_src->cost;
+
+ return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp);
+}
+
static struct nft_expr_type nft_limit_type;
static const struct nft_expr_ops nft_limit_pkts_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.eval = nft_limit_pkts_eval,
.init = nft_limit_pkts_init,
+ .destroy = nft_limit_pkts_destroy,
+ .clone = nft_limit_pkts_clone,
.dump = nft_limit_pkts_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static void nft_limit_bytes_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -190,25 +261,45 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_init(priv, tb, false);
}
static int nft_limit_bytes_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, priv);
+}
+
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
+{
+ struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv *priv_src = nft_expr_priv(src);
+
+ return nft_limit_clone(priv_dst, priv_src, gfp);
+}
+
static const struct nft_expr_ops nft_limit_bytes_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)),
.eval = nft_limit_bytes_eval,
.init = nft_limit_bytes_init,
.dump = nft_limit_bytes_dump,
+ .clone = nft_limit_bytes_clone,
+ .destroy = nft_limit_bytes_destroy,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -240,7 +331,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -250,7 +341,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -265,16 +356,25 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit_pkts *priv = nft_obj_data(obj);
+ const struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_pkts_ops = {
.type = &nft_limit_obj_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.init = nft_limit_obj_pkts_init,
+ .destroy = nft_limit_obj_pkts_destroy,
.eval = nft_limit_obj_pkts_eval,
.dump = nft_limit_obj_pkts_dump,
};
@@ -283,7 +383,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -294,7 +394,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_init(priv, tb, false);
}
@@ -303,16 +403,25 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit *priv = nft_obj_data(obj);
+ const struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_bytes_ops = {
.type = &nft_limit_obj_type,
- .size = sizeof(struct nft_limit),
+ .size = sizeof(struct nft_limit_priv),
.init = nft_limit_obj_bytes_init,
+ .destroy = nft_limit_obj_bytes_destroy,
.eval = nft_limit_obj_bytes_eval,
.dump = nft_limit_obj_bytes_dump,
};
@@ -372,3 +481,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("limit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT);
+MODULE_DESCRIPTION("nftables limit expression support");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index fe4831f2258f..e35588137995 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -128,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_FLAGS] = { .type = NLA_U32 },
};
+static int nft_log_modprobe(struct net *net, enum nf_log_type t)
+{
+ switch (t) {
+ case NF_LOG_TYPE_LOG:
+ return nft_request_module(net, "%s", "nf_log_syslog");
+ case NF_LOG_TYPE_ULOG:
+ return nft_request_module(net, "%s", "nfnetlink_log");
+ case NF_LOG_TYPE_MAX:
+ break;
+ }
+
+ return -ENOENT;
+}
+
static int nft_log_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -149,10 +163,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
- priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT);
if (priv->prefix == NULL)
return -ENOMEM;
- nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
} else {
priv->prefix = (char *)nft_log_null_prefix;
}
@@ -197,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx,
return 0;
err = nf_logger_find_get(ctx->family, li->type);
- if (err < 0)
+ if (err < 0) {
+ if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN)
+ err = -EAGAIN;
+
goto err1;
+ }
return 0;
@@ -223,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
nf_logger_put(ctx->family, li->type);
}
-static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_log_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_log *priv = nft_expr_priv(expr);
const struct nf_loginfo *li = &priv->loginfo;
@@ -272,6 +291,7 @@ static const struct nft_expr_ops nft_log_ops = {
.init = nft_log_init,
.destroy = nft_log_destroy,
.dump = nft_log_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_log_type __read_mostly = {
@@ -298,3 +318,4 @@ module_exit(nft_log_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("log");
+MODULE_DESCRIPTION("Netfilter nf_tables log module");
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f1363b8aabba..fc2d7c5d83c8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -17,30 +17,103 @@
struct nft_lookup {
struct nft_set *set;
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
+ bool dreg_set;
bool invert;
struct nft_set_binding binding;
};
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ if (set->ops == &nft_set_hash_fast_type.ops)
+ return nft_hash_lookup_fast(net, set, key);
+ if (set->ops == &nft_set_hash_type.ops)
+ return nft_hash_lookup(net, set, key);
+
+ if (set->ops == &nft_set_rhash_type.ops)
+ return nft_rhash_lookup(net, set, key);
+
+ if (set->ops == &nft_set_bitmap_type.ops)
+ return nft_bitmap_lookup(net, set, key);
+
+ if (set->ops == &nft_set_pipapo_type.ops)
+ return nft_pipapo_lookup(net, set, key);
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (set->ops == &nft_set_pipapo_avx2_type.ops)
+ return nft_pipapo_avx2_lookup(net, set, key);
+#endif
+
+ if (set->ops == &nft_set_rbtree_type.ops)
+ return nft_rbtree_lookup(net, set, key);
+
+ WARN_ON_ONCE(1);
+#endif
+ return set->ops->lookup(net, set, key);
+}
+
+static unsigned int nft_base_seq(const struct net *net)
+{
+ /* pairs with smp_store_release() in nf_tables_commit() */
+ return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+ return unlikely(seq != nft_base_seq(net));
+}
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ const struct nft_set_ext *ext;
+ unsigned int base_seq;
+
+ do {
+ base_seq = nft_base_seq(net);
+
+ ext = __nft_set_do_lookup(net, set, key);
+ if (ext)
+ break;
+ /* No match? There is a small chance that lookup was
+ * performed in the old generation, but nf_tables_commit()
+ * already unlinked a (matching) element.
+ *
+ * We need to repeat the lookup to make sure that we didn't
+ * miss a matching element in the new generation.
+ */
+ } while (nft_lookup_should_retry(net, base_seq));
+
+ return ext;
+}
+EXPORT_SYMBOL_GPL(nft_set_do_lookup);
+
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
- const struct nft_set_ext *ext = NULL;
+ const struct net *net = nft_net(pkt);
+ const struct nft_set_ext *ext;
bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext) ^ priv->invert;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ found = !!ext ^ priv->invert;
if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
if (ext) {
- if (set->flags & NFT_SET_MAP)
+ if (priv->dreg_set)
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
@@ -54,7 +127,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
- [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV),
};
static int nft_lookup_init(const struct nft_ctx *ctx,
@@ -76,22 +150,16 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
- priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
if (tb[NFTA_LOOKUP_FLAGS]) {
flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS]));
- if (flags & ~NFT_LOOKUP_F_INV)
- return -EINVAL;
-
- if (flags & NFT_LOOKUP_F_INV) {
- if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ if (flags & NFT_LOOKUP_F_INV)
priv->invert = true;
- }
}
if (tb[NFTA_LOOKUP_DREG] != NULL) {
@@ -100,13 +168,23 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_MAP))
return -EINVAL;
- priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- set->dtype, set->dlen);
+ err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG],
+ &priv->dreg, NULL,
+ nft_set_datatype(set),
+ set->dlen);
if (err < 0)
return err;
- } else if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ priv->dreg_set = true;
+ } else if (set->flags & NFT_SET_MAP) {
+ /* Map given, but user asks for lookup only (i.e. to
+ * ignore value assoicated with key).
+ *
+ * This makes no sense for anonymous maps since they are
+ * scoped to the rule, but for named sets this can be useful.
+ */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ return -EINVAL;
+ }
priv->binding.flags = set->flags & NFT_SET_MAP;
@@ -132,7 +210,7 @@ static void nft_lookup_activate(const struct nft_ctx *ctx,
{
struct nft_lookup *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -143,7 +221,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_lookup_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0;
@@ -152,7 +231,7 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg))
goto nla_put_failure;
- if (priv->set->flags & NFT_SET_MAP)
+ if (priv->dreg_set)
if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags)))
@@ -163,61 +242,41 @@ nla_put_failure:
return -1;
}
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- struct nft_ctx *pctx = (struct nft_ctx *)ctx;
- const struct nft_data *data;
- int err;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- pctx->level++;
- err = nft_chain_validate(ctx, data->verdict.chain);
- if (err < 0)
- return err;
- pctx->level--;
- break;
- default:
- break;
- }
-
- return 0;
-}
-
static int nft_lookup_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
- struct nft_set_iter iter;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_setelem_validate,
+ };
if (!(priv->set->flags & NFT_SET_MAP) ||
priv->set->dtype != NFT_DATA_VERDICT)
return 0;
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nft_lookup_validate_setelem;
-
priv->set->ops->walk(ctx, priv->set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_validate(ctx, priv->set);
+
if (iter.err < 0)
return iter.err;
return 0;
}
+static bool nft_lookup_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+
+ if (priv->set->flags & NFT_SET_MAP)
+ nft_reg_track_cancel(track, priv->dreg, priv->set->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -228,6 +287,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
.validate = nft_lookup_validate,
+ .reduce = nft_lookup_reduce,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index bc9fd98c5d6d..868bd4d73555 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -15,19 +15,19 @@
struct nft_masq {
u32 flags;
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
};
static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
- [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
[NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 },
};
static int nft_masq_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
int err;
@@ -43,30 +43,23 @@ static int nft_masq_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- u32 plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ u32 plen = sizeof_field(struct nf_nat_range, min_proto.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
- if (tb[NFTA_MASQ_FLAGS]) {
+ if (tb[NFTA_MASQ_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
@@ -77,7 +70,8 @@ static int nft_masq_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_masq_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_masq *priv = nft_expr_priv(expr);
@@ -99,23 +93,39 @@ nla_put_failure:
return -1;
}
-static void nft_masq_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_masq_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_masq *priv = nft_expr_priv(expr);
+ const struct nft_masq *priv = nft_expr_priv(expr);
struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+ }
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb,
+ nft_hook(pkt),
+ &range,
+ nft_out(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
+ nft_out(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
- &range, nft_out(pkt));
}
static void
@@ -128,11 +138,12 @@ static struct nft_expr_type nft_masq_ipv4_type;
static const struct nft_expr_ops nft_masq_ipv4_ops = {
.type = &nft_masq_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv4_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_ipv4_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
@@ -145,25 +156,6 @@ static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
};
#ifdef CONFIG_NF_TABLES_IPV6
-static void nft_masq_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- range.flags = priv->flags;
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- }
- regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
- nft_out(pkt));
-}
-
static void
nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -174,11 +166,12 @@ static struct nft_expr_type nft_masq_ipv6_type;
static const struct nft_expr_ops nft_masq_ipv6_ops = {
.type = &nft_masq_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv6_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_ipv6_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
@@ -205,20 +198,6 @@ static inline void nft_masq_module_exit_ipv6(void) {}
#endif
#ifdef CONFIG_NF_TABLES_INET
-static void nft_masq_inet_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- return nft_masq_ipv4_eval(expr, regs, pkt);
- case NFPROTO_IPV6:
- return nft_masq_ipv6_eval(expr, regs, pkt);
- }
-
- WARN_ON_ONCE(1);
-}
-
static void
nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -229,11 +208,12 @@ static struct nft_expr_type nft_masq_inet_type;
static const struct nft_expr_ops nft_masq_inet_ops = {
.type = &nft_masq_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_inet_eval,
+ .eval = nft_masq_eval,
.init = nft_masq_init,
.destroy = nft_masq_inet_destroy,
.dump = nft_masq_dump,
.validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_masq_inet_type __read_mostly = {
@@ -305,3 +285,4 @@ module_exit(nft_masq_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_EXPR("masq");
+MODULE_DESCRIPTION("Netfilter nftables masquerade expression support");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 951b6e87ed5d..05cd1e6e6a2f 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -14,6 +14,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/random.h>
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
@@ -32,8 +33,6 @@
#define NFT_META_SECS_PER_DAY 86400
#define NFT_META_DAYS_PER_WEEK 7
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
-
static u8 nft_meta_weekday(void)
{
time64_t secs = ktime_get_real_seconds();
@@ -64,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key,
{
switch (key) {
case NFT_META_TIME_NS:
- nft_reg_store64(dest, ktime_get_real_ns());
+ nft_reg_store64((u64 *)dest, ktime_get_real_ns());
break;
case NFT_META_TIME_DAY:
nft_reg_store8(dest, nft_meta_weekday());
@@ -147,11 +146,11 @@ nft_meta_get_eval_skugid(enum nft_meta_keys key,
switch (key) {
case NFT_META_SKUID:
- *dest = from_kuid_munged(&init_user_ns,
+ *dest = from_kuid_munged(sock_net(sk)->user_ns,
sock->file->f_cred->fsuid);
break;
case NFT_META_SKGID:
- *dest = from_kgid_munged(&init_user_ns,
+ *dest = from_kgid_munged(sock_net(sk)->user_ns,
sock->file->f_cred->fsgid);
break;
default:
@@ -186,12 +185,12 @@ static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key,
case NFT_META_IIFKIND:
if (!in || !in->rtnl_link_ops)
return false;
- strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+ strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
break;
case NFT_META_OIFKIND:
if (!out || !out->rtnl_link_ops)
return false;
- strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+ strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
break;
default:
return false;
@@ -207,7 +206,7 @@ static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev)
static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev)
{
- strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ);
+ strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ);
}
static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev)
@@ -244,7 +243,11 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
case NFT_META_OIF:
nft_meta_store_ifindex(dest, nft_out(pkt));
break;
- case NFT_META_IIFTYPE:
+ case NFT_META_IFTYPE:
+ if (!nft_meta_store_iftype(dest, pkt->skb->dev))
+ return false;
+ break;
+ case __NFT_META_IIFTYPE:
if (!nft_meta_store_iftype(dest, nft_in(pkt)))
return false;
break;
@@ -253,7 +256,7 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return false;
break;
case NFT_META_IIFGROUP:
- if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+ if (!nft_meta_store_ifgroup(dest, nft_in(pkt)))
return false;
break;
case NFT_META_OIFGROUP:
@@ -267,13 +270,6 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
return true;
}
-static noinline u32 nft_prandom_u32(void)
-{
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-
- return prandom_u32_state(state);
-}
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static noinline bool
nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
@@ -329,7 +325,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, nft_pf(pkt));
break;
case NFT_META_L4PROTO:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
goto err;
nft_reg_store8(dest, pkt->tprot);
break;
@@ -385,7 +381,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
#endif
case NFT_META_PRANDOM:
- *dest = nft_prandom_u32();
+ *dest = get_random_u32();
break;
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
@@ -462,7 +458,7 @@ EXPORT_SYMBOL_GPL(nft_meta_set_eval);
const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
- [NFTA_META_KEY] = { .type = NLA_U32 },
+ [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_META_SREG] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(nft_meta_policy);
@@ -514,7 +510,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
- prandom_init_once(&nft_prandom_state);
len = sizeof(u32);
break;
#ifdef CONFIG_XFRM
@@ -535,9 +530,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
@@ -586,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
}
static int nft_meta_get_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -605,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
}
int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -661,8 +654,8 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
if (err < 0)
return err;
@@ -674,7 +667,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
EXPORT_SYMBOL_GPL(nft_meta_set_init);
int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -689,7 +682,8 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -724,22 +718,22 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
switch (priv->key) {
case NFT_META_PROTOCOL:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
- sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+ sizeof(__u16), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case NFT_META_L4PROTO:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
- sizeof(__u8), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
break;
case NFT_META_IIF:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_ifindex, sizeof(__u32), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
break;
case NFT_META_IIFTYPE:
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
- ingress_iftype, sizeof(__u16), reg);
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
break;
default:
return -EOPNOTSUPP;
@@ -748,16 +742,60 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
return 0;
}
+bool nft_meta_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct nft_meta *meta;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ meta = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != meta->key ||
+ priv->dreg != meta->dreg) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_reduce);
+
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_get_eval,
.init = nft_meta_get_init,
.dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
.validate = nft_meta_get_validate,
.offload = nft_meta_get_offload,
};
+static bool nft_meta_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_meta_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_meta_set_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -765,6 +803,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .reduce = nft_meta_set_reduce,
.validate = nft_meta_set_validate,
};
@@ -791,9 +830,74 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
+static int nft_meta_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG])
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ len = sizeof(u16);
+ break;
+ case NFT_META_L4PROTO:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ priv->len = len;
+
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+void nft_meta_inner_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ nft_reg_store16(dest, (__force u16)tun_ctx->llproto);
+ break;
+ case NFT_META_L4PROTO:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ nft_reg_store8(dest, tun_ctx->l4proto);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_inner_eval);
+
+static const struct nft_expr_ops nft_meta_inner_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .init = nft_meta_inner_init,
+ .dump = nft_meta_get_dump,
+ /* direct call to nft_meta_inner_eval(). */
+};
+
struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
+ .inner_ops = &nft_meta_inner_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
@@ -848,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_SECMARK_CTX] == NULL)
return -EINVAL;
- priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+ priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT);
if (!priv->ctx)
return -ENOMEM;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 23a7bfd10521..6e21f72c5b57 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -21,10 +21,10 @@
#include <net/ip.h>
struct nft_nat {
- enum nft_registers sreg_addr_min:8;
- enum nft_registers sreg_addr_max:8;
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_addr_min;
+ u8 sreg_addr_max;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
enum nf_nat_manip_type type:8;
u8 family;
u16 flags;
@@ -132,16 +132,21 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
[NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_NAT_FLAGS] = { .type = NLA_U32 },
+ [NFTA_NAT_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
static int nft_nat_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct nft_nat *priv = nft_expr_priv(expr);
int err;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
if (err < 0)
return err;
@@ -201,23 +206,22 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
alen = sizeof_field(struct nf_nat_range, min_addr.ip6);
break;
default:
- return -EAFNOSUPPORT;
+ if (tb[NFTA_NAT_REG_ADDR_MIN])
+ return -EAFNOSUPPORT;
+ break;
}
priv->family = family;
if (tb[NFTA_NAT_REG_ADDR_MIN]) {
- priv->sreg_addr_min =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]);
- err = nft_validate_register_load(priv->sreg_addr_min, alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN],
+ &priv->sreg_addr_min, alen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_ADDR_MAX]) {
- priv->sreg_addr_max =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]);
-
- err = nft_validate_register_load(priv->sreg_addr_max,
- alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX],
+ &priv->sreg_addr_max,
+ alen);
if (err < 0)
return err;
} else {
@@ -227,21 +231,17 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags |= NF_NAT_RANGE_MAP_IPS;
}
- plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_NAT_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
@@ -251,16 +251,14 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
- if (tb[NFTA_NAT_FLAGS]) {
+ if (tb[NFTA_NAT_FLAGS])
priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EOPNOTSUPP;
- }
return nf_ct_netns_get(ctx->net, family);
}
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_nat_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_nat *priv = nft_expr_priv(expr);
@@ -322,6 +320,7 @@ static const struct nft_expr_ops nft_nat_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_nat_type __read_mostly = {
@@ -339,7 +338,8 @@ static void nft_nat_inet_eval(const struct nft_expr *expr,
{
const struct nft_nat *priv = nft_expr_priv(expr);
- if (priv->family == nft_pf(pkt))
+ if (priv->family == nft_pf(pkt) ||
+ priv->family == NFPROTO_INET)
nft_nat_eval(expr, regs, pkt);
}
@@ -351,6 +351,7 @@ static const struct nft_expr_ops nft_nat_inet_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_inet_nat_type __read_mostly = {
@@ -402,3 +403,4 @@ module_exit(nft_nat_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
MODULE_ALIAS_NFT_EXPR("nat");
+MODULE_DESCRIPTION("Network Address Translation support");
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 48edb9d5f012..bd058babfc82 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -9,16 +9,15 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
#include <linux/static_key.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
struct nft_ng_inc {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
- atomic_t counter;
+ atomic_t *counter;
u32 offset;
};
@@ -27,9 +26,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
u32 nval, oval;
do {
- oval = atomic_read(&priv->counter);
+ oval = atomic_read(priv->counter);
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
- } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+ } while (atomic_cmpxchg(priv->counter, oval, nval) != oval);
return nval + priv->offset;
}
@@ -55,6 +54,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_ng_inc *priv = nft_expr_priv(expr);
+ int err;
if (tb[NFTA_NG_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
@@ -66,11 +66,32 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
- atomic_set(&priv->counter, priv->modulus - 1);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT);
+ if (!priv->counter)
+ return -ENOMEM;
+
+ atomic_set(priv->counter, priv->modulus - 1);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(priv->counter);
+
+ return err;
+}
+
+static bool nft_ng_inc_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
}
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
@@ -91,7 +112,8 @@ nla_put_failure:
return -1;
}
-static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_inc_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_inc *priv = nft_expr_priv(expr);
@@ -99,18 +121,23 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static void nft_ng_inc_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ kfree(priv->counter);
+}
+
struct nft_ng_random {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
};
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
{
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
- return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
- priv->offset;
+ return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
}
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -138,15 +165,12 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- prandom_init_once(&nft_numgen_prandom_state);
-
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
-
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_random_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -154,13 +178,25 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static bool nft_ng_random_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
.eval = nft_ng_inc_eval,
.init = nft_ng_inc_init,
+ .destroy = nft_ng_inc_destroy,
.dump = nft_ng_inc_dump,
+ .reduce = nft_ng_inc_reduce,
};
static const struct nft_expr_ops nft_ng_random_ops = {
@@ -169,6 +205,7 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.eval = nft_ng_random_eval,
.init = nft_ng_random_init,
.dump = nft_ng_random_dump,
+ .reduce = nft_ng_random_reduce,
};
static const struct nft_expr_ops *
@@ -217,3 +254,4 @@ module_exit(nft_ng_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("numgen");
+MODULE_DESCRIPTION("nftables number generator module");
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index bfd18d2b65a2..1a62e384766a 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -9,19 +9,48 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
-static void nft_objref_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_object *obj = nft_objref_priv(expr);
obj->ops->eval(obj, regs, pkt);
}
+static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type)
+{
+ unsigned int hooks;
+
+ switch (type) {
+ case NFT_OBJECT_SYNPROXY:
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD);
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_objref_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, obj->ops->type->type);
+}
+
static int nft_objref_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -41,13 +70,16 @@ static int nft_objref_init(const struct nft_ctx *ctx,
if (IS_ERR(obj))
return -ENOENT;
+ if (!nft_use_inc(&obj->use))
+ return -EMFILE;
+
nft_objref_priv(expr) = obj;
- obj->use++;
return 0;
}
-static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_object *obj = nft_objref_priv(expr);
@@ -71,7 +103,7 @@ static void nft_objref_deactivate(const struct nft_ctx *ctx,
if (phase == NFT_TRANS_COMMIT)
return;
- obj->use--;
+ nft_use_dec(&obj->use);
}
static void nft_objref_activate(const struct nft_ctx *ctx,
@@ -79,10 +111,9 @@ static void nft_objref_activate(const struct nft_ctx *ctx,
{
struct nft_object *obj = nft_objref_priv(expr);
- obj->use++;
+ nft_use_inc_restore(&obj->use);
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
@@ -91,29 +122,33 @@ static const struct nft_expr_ops nft_objref_ops = {
.activate = nft_objref_activate,
.deactivate = nft_objref_deactivate,
.dump = nft_objref_dump,
+ .validate = nft_objref_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_objref_map {
struct nft_set *set;
- enum nft_registers sreg:8;
+ u8 sreg;
struct nft_set_binding binding;
};
-static void nft_objref_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
+ struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
struct nft_object *obj;
- bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext);
- if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ if (!ext) {
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
obj = *nft_set_ext_obj(ext);
obj->ops->eval(obj, regs, pkt);
@@ -137,8 +172,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_OBJECT))
return -EINVAL;
- priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
@@ -152,7 +187,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_objref_map *priv = nft_expr_priv(expr);
@@ -180,7 +216,7 @@ static void nft_objref_map_activate(const struct nft_ctx *ctx,
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- priv->set->use++;
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -191,7 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static struct nft_expr_type nft_objref_type;
+static int nft_objref_map_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, priv->set->objtype);
+}
+
static const struct nft_expr_ops nft_objref_map_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
@@ -201,6 +244,8 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
+ .validate = nft_objref_map_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -228,27 +273,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
[NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
};
-static struct nft_expr_type nft_objref_type __read_mostly = {
+struct nft_expr_type nft_objref_type __read_mostly = {
.name = "objref",
.select_ops = nft_objref_select_ops,
.policy = nft_objref_policy,
.maxattr = NFTA_OBJREF_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_objref_module_init(void)
-{
- return nft_register_expr(&nft_objref_type);
-}
-
-static void __exit nft_objref_module_exit(void)
-{
- nft_unregister_expr(&nft_objref_type);
-}
-
-module_init(nft_objref_module_init);
-module_exit(nft_objref_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index b42247aa48a9..1c0b493ef0a9 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -6,7 +6,7 @@
#include <linux/netfilter/nfnetlink_osf.h>
struct nft_osf {
- enum nft_registers dreg:8;
+ u8 dreg;
u8 ttl;
u32 flags;
};
@@ -23,11 +23,16 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nft_osf *priv = nft_expr_priv(expr);
u32 *dest = &regs->data[priv->dreg];
struct sk_buff *skb = pkt->skb;
- char os_match[NFT_OSF_MAXGENRELEN + 1];
+ char os_match[NFT_OSF_MAXGENRELEN];
const struct tcphdr *tcp;
struct nf_osf_data data;
struct tcphdr _tcph;
+ if (pkt->tprot != IPPROTO_TCP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
tcp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(struct tcphdr), &_tcph);
if (!tcp) {
@@ -40,15 +45,15 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
}
if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) {
- strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
+ strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
} else {
if (priv->flags & NFT_OSF_F_VERSION)
snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
data.genre, data.version);
else
- strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
+ strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
- strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
+ strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
}
}
@@ -58,7 +63,6 @@ static int nft_osf_init(const struct nft_ctx *ctx,
{
struct nft_osf *priv = nft_expr_priv(expr);
u32 flags;
- int err;
u8 ttl;
if (!tb[NFTA_OSF_DREG])
@@ -78,23 +82,20 @@ static int nft_osf_init(const struct nft_ctx *ctx,
priv->flags = flags;
}
- priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
- if (err < 0)
- return err;
-
- return 0;
+ return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE,
+ NFT_OSF_MAXGENRELEN);
}
-static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_osf_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_osf *priv = nft_expr_priv(expr);
if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_OSF_FLAGS, ntohl(priv->flags)))
+ if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags)))
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
@@ -107,12 +108,47 @@ nla_put_failure:
}
static int nft_osf_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
+{
+ unsigned int hooks;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static bool nft_osf_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_FORWARD));
+ struct nft_osf *priv = nft_expr_priv(expr);
+ struct nft_osf *osf;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ osf = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->flags != osf->flags ||
+ priv->ttl != osf->ttl) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
}
static struct nft_expr_type nft_osf_type;
@@ -123,6 +159,7 @@ static const struct nft_expr_ops nft_osf_op = {
.dump = nft_osf_dump,
.type = &nft_osf_type,
.validate = nft_osf_validate,
+ .reduce = nft_osf_reduce,
};
static struct nft_expr_type nft_osf_type __read_mostly = {
@@ -149,3 +186,4 @@ module_exit(nft_osf_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("osf");
+MODULE_DESCRIPTION("nftables passive OS fingerprint support");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index a7de3a58f553..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -19,9 +19,11 @@
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <net/gre.h>
#include <linux/icmpv6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <net/sctp/checksum.h>
static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
struct vlan_ethhdr *veth)
@@ -38,46 +40,122 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
/* add vlan header into the user buffer for if tag was removed by offloads */
static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
u8 *vlanh, *dst_u8 = (u8 *) d;
struct vlan_ethhdr veth;
- u8 vlan_hlen = 0;
-
- if ((skb->protocol == htons(ETH_P_8021AD) ||
- skb->protocol == htons(ETH_P_8021Q)) &&
- offset >= VLAN_ETH_HLEN && offset < VLAN_ETH_HLEN + VLAN_HLEN)
- vlan_hlen += VLAN_HLEN;
vlanh = (u8 *) &veth;
- if (offset < VLAN_ETH_HLEN + vlan_hlen) {
+ if (offset < VLAN_ETH_HLEN) {
u8 ethlen = len;
- if (vlan_hlen &&
- skb_copy_bits(skb, mac_off, &veth, VLAN_ETH_HLEN) < 0)
- return false;
- else if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
+ if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
return false;
- if (offset + len > VLAN_ETH_HLEN + vlan_hlen)
- ethlen -= offset + len - VLAN_ETH_HLEN + vlan_hlen;
+ if (offset + len > VLAN_ETH_HLEN)
+ ethlen -= offset + len - VLAN_ETH_HLEN;
- memcpy(dst_u8, vlanh + offset - vlan_hlen, ethlen);
+ memcpy(dst_u8, vlanh + offset, ethlen);
len -= ethlen;
if (len == 0)
return true;
dst_u8 += ethlen;
- offset = ETH_HLEN + vlan_hlen;
+ offset = ETH_HLEN;
} else {
- offset -= VLAN_HLEN + vlan_hlen;
+ offset -= VLAN_HLEN;
}
return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
}
+static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
+{
+ unsigned int thoff = nft_thoff(pkt);
+
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ return -1;
+
+ switch (pkt->tprot) {
+ case IPPROTO_UDP:
+ pkt->inneroff = thoff + sizeof(struct udphdr);
+ break;
+ case IPPROTO_TCP: {
+ struct tcphdr *th, _tcph;
+
+ th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph);
+ if (!th)
+ return -1;
+
+ pkt->inneroff = thoff + __tcp_hdrlen(th);
+ }
+ break;
+ case IPPROTO_GRE: {
+ u32 offset = sizeof(struct gre_base_hdr);
+ struct gre_base_hdr *gre, _gre;
+ __be16 version;
+
+ gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre);
+ if (!gre)
+ return -1;
+
+ version = gre->flags & GRE_VERSION;
+ switch (version) {
+ case GRE_VERSION_0:
+ if (gre->flags & GRE_ROUTING)
+ return -1;
+
+ if (gre->flags & GRE_CSUM) {
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
+ }
+ if (gre->flags & GRE_KEY)
+ offset += sizeof_field(struct gre_full_hdr, key);
+
+ if (gre->flags & GRE_SEQ)
+ offset += sizeof_field(struct gre_full_hdr, seq);
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->inneroff = thoff + offset;
+ }
+ break;
+ case IPPROTO_IPIP:
+ pkt->inneroff = thoff;
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->flags |= NFT_PKTINFO_INNER;
+
+ return 0;
+}
+
+int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER) &&
+ __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0)
+ return -1;
+
+ return pkt->inneroff;
+}
+
+static bool nft_payload_need_vlan_adjust(u32 offset, u32 len)
+{
+ unsigned int boundary = offset + len;
+
+ /* data past ether src/dst requested, copy needed */
+ if (boundary > offsetof(struct ethhdr, h_proto))
+ return true;
+
+ return false;
+}
+
void nft_payload_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -87,13 +165,16 @@ void nft_payload_eval(const struct nft_expr *expr,
u32 *dest = &regs->data[priv->dreg];
int offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
- if (!skb_mac_header_was_set(skb))
+ if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0)
goto err;
- if (skb_vlan_tag_present(skb)) {
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
if (!nft_payload_copy_vlan(dest, skb,
priv->offset, priv->len))
goto err;
@@ -105,12 +186,18 @@ void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
offset += priv->offset;
@@ -125,10 +212,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 },
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
@@ -141,13 +228,14 @@ static int nft_payload_init(const struct nft_ctx *ctx,
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
-static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload *priv = nft_expr_priv(expr);
@@ -162,6 +250,59 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct nft_payload *payload;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ payload = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->base != payload->base ||
+ priv->offset != payload->offset ||
+ priv->len != payload->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static bool nft_payload_offload_mask(struct nft_offload_reg *reg,
+ u32 priv_len, u32 field_len)
+{
+ unsigned int remainder, delta, k;
+ struct nft_data mask = {};
+ __be32 remainder_mask;
+
+ if (priv_len == field_len) {
+ memset(&reg->mask, 0xff, priv_len);
+ return true;
+ } else if (priv_len > field_len) {
+ return false;
+ }
+
+ memset(&mask, 0xff, field_len);
+ remainder = priv_len % sizeof(u32);
+ if (remainder) {
+ k = priv_len / sizeof(u32);
+ delta = field_len - priv_len;
+ remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1));
+ mask.data[k] = (__force u32)remainder_mask;
+ }
+
+ memcpy(&reg->mask, &mask, field_len);
+
+ return true;
+}
+
static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_payload *priv)
@@ -170,21 +311,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ethhdr, h_source):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
src, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_dest):
- if (priv->len != ETH_ALEN)
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
dst, ETH_ALEN, reg);
break;
case offsetof(struct ethhdr, h_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
@@ -192,14 +333,15 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
- vlan_tci, sizeof(__be16), reg);
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
@@ -207,19 +349,21 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
- vlan_tci, sizeof(__be16), reg);
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
break;
case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
sizeof(struct vlan_hdr):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
- NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan,
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
break;
default:
return -EOPNOTSUPP;
@@ -236,21 +380,25 @@ static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct iphdr, saddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, daddr):
- if (priv->len != sizeof(struct in_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
break;
case offsetof(struct iphdr, protocol):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -272,21 +420,25 @@ static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct ipv6hdr, saddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, daddr):
- if (priv->len != sizeof(struct in6_addr))
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
break;
case offsetof(struct ipv6hdr, nexthdr):
- if (priv->len != sizeof(__u8))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
@@ -328,14 +480,14 @@ static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct tcphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct tcphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -356,14 +508,14 @@ static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
switch (priv->offset) {
case offsetof(struct udphdr, source):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
sizeof(__be16), reg);
break;
case offsetof(struct udphdr, dest):
- if (priv->len != sizeof(__be16))
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
return -EOPNOTSUPP;
NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
@@ -426,6 +578,7 @@ static const struct nft_expr_ops nft_payload_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
@@ -435,12 +588,103 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
.offload = nft_payload_offload,
};
+void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ int offset;
+
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN))
+ goto err;
+
+ offset = tun_ctx->inner_tunoff;
+ break;
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL))
+ goto err;
+
+ offset = tun_ctx->inner_lloff;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH))
+ goto err;
+
+ offset = tun_ctx->inner_nhoff;
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ offset = tun_ctx->inner_thoff;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ offset += priv->offset;
+
+ if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
+ goto err;
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_payload_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+ u32 base;
+
+ if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] ||
+ !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG])
+ return -EINVAL;
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->base = base;
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
+}
+
+static const struct nft_expr_ops nft_payload_inner_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .init = nft_payload_inner_init,
+ .dump = nft_payload_dump,
+ /* direct call to nft_payload_inner_eval(). */
+};
+
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
if (*sum == 0)
*sum = CSUM_MANGLED_0;
}
@@ -460,14 +704,17 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
struct sk_buff *skb,
unsigned int *l4csum_offset)
{
+ if (pkt->fragoff)
+ return -1;
+
switch (pkt->tprot) {
case IPPROTO_TCP:
*l4csum_offset = offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
- if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ if (!nft_payload_udp_checksum(skb, nft_thoff(pkt)))
return -1;
- /* Fall through. */
+ fallthrough;
case IPPROTO_UDPLITE:
*l4csum_offset = offsetof(struct udphdr, check);
break;
@@ -478,7 +725,20 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
return -1;
}
- *l4csum_offset += pkt->xt.thoff;
+ *l4csum_offset += nft_thoff(pkt);
+ return 0;
+}
+
+static int nft_payload_csum_sctp(struct sk_buff *skb, int offset)
+{
+ struct sctphdr *sh;
+
+ if (skb_ensure_writable(skb, offset + sizeof(*sh)))
+ return -1;
+
+ sh = (struct sctphdr *)(skb->data + offset);
+ sh->checksum = sctp_compute_cksum(skb, offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
}
@@ -535,40 +795,118 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return 0;
}
+struct nft_payload_set {
+ enum nft_payload_bases base:8;
+ u16 offset;
+ u8 len;
+ u8 sreg;
+ u8 csum_type;
+ u8 csum_offset;
+ u8 csum_flags;
+};
+
+/* This is not struct vlan_hdr. */
+struct nft_payload_vlan_hdr {
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
+};
+
+static bool
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
+ int *vlan_hlen)
+{
+ struct nft_payload_vlan_hdr *vlanh;
+ __be16 vlan_proto;
+ u16 vlan_tci;
+
+ if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) {
+ *vlan_hlen = VLAN_HLEN;
+ return true;
+ }
+
+ switch (offset) {
+ case offsetof(struct vlan_ethhdr, h_vlan_proto):
+ if (len == 2) {
+ vlan_proto = nft_reg_load_be16(src);
+ skb->vlan_proto = vlan_proto;
+ } else if (len == 4) {
+ vlanh = (struct nft_payload_vlan_hdr *)src;
+ __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto,
+ ntohs(vlanh->h_vlan_TCI));
+ } else {
+ return false;
+ }
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (len != 2)
+ return false;
+
+ vlan_tci = ntohs(nft_reg_load_be16(src));
+ skb->vlan_tci = vlan_tci;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
- struct sk_buff *skb = pkt->skb;
const u32 *src = &regs->data[priv->sreg];
- int offset, csum_offset;
+ int offset, csum_offset, vlan_hlen = 0;
+ struct sk_buff *skb = pkt->skb;
__wsum fsum, tsum;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
goto err;
- offset = skb_mac_header(skb) - skb->data;
+
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
+ if (!nft_payload_set_vlan(src, skb,
+ priv->offset, priv->len,
+ &vlan_hlen))
+ goto err;
+
+ if (!vlan_hlen)
+ return;
+ }
+
+ offset = skb_mac_header(skb) - skb->data - vlan_hlen;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
csum_offset = offset + priv->csum_offset;
offset += priv->offset;
if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
- (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
+ ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER &&
+ priv->base != NFT_PAYLOAD_INNER_HEADER) ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
+ if (offset + priv->len > skb->len)
+ goto err;
+
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
@@ -585,6 +923,14 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
+ pkt->tprot == IPPROTO_SCTP &&
+ skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (pkt->fragoff == 0 &&
+ nft_payload_csum_sctp(skb, nft_thoff(pkt)))
+ goto err;
+ }
+
return;
err:
regs->verdict.code = NFT_BREAK;
@@ -594,19 +940,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
struct nft_payload_set *priv = nft_expr_priv(expr);
+ int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
- priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return err;
+ priv->offset = offset;
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
- priv->csum_type =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
- if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
- priv->csum_offset =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
+ if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX,
+ &csum_offset);
+ if (err < 0)
+ return err;
+
+ priv->csum_offset = csum_offset;
+ }
if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
u32 flags;
@@ -617,18 +972,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->csum_flags = flags;
}
- switch (priv->csum_type) {
+ switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
case NFT_PAYLOAD_CSUM_INET:
break;
+ case NFT_PAYLOAD_CSUM_SCTP:
+ if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
+ return -EINVAL;
+
+ if (priv->csum_offset != offsetof(struct sctphdr, checksum))
+ return -EINVAL;
+ break;
default:
return -EOPNOTSUPP;
}
+ priv->csum_type = csum_type;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg,
+ priv->len);
}
-static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
@@ -647,12 +1012,32 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_payload_ops &&
+ track->regs[i].selector->ops != &nft_payload_fast_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_payload_set_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
.eval = nft_payload_set_eval,
.init = nft_payload_set_init,
.dump = nft_payload_set_dump,
+ .reduce = nft_payload_set_reduce,
};
static const struct nft_expr_ops *
@@ -661,6 +1046,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
{
enum nft_payload_bases base;
unsigned int offset, len;
+ int err;
if (tb[NFTA_PAYLOAD_BASE] == NULL ||
tb[NFTA_PAYLOAD_OFFSET] == NULL ||
@@ -672,6 +1058,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
case NFT_PAYLOAD_LL_HEADER:
case NFT_PAYLOAD_NETWORK_HEADER:
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ case NFT_PAYLOAD_INNER_HEADER:
break;
default:
return ERR_PTR(-EOPNOTSUPP);
@@ -686,11 +1073,16 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
- base != NFT_PAYLOAD_LL_HEADER)
+ base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER)
return &nft_payload_fast_ops;
else
return &nft_payload_ops;
@@ -699,6 +1091,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
+ .inner_ops = &nft_payload_inner_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 5ece0a6aa8c3..344fe311878f 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -19,10 +19,10 @@
static u32 jhash_initval __read_mostly;
struct nft_queue {
- enum nft_registers sreg_qnum:8;
- u16 queuenum;
- u16 queues_total;
- u16 flags;
+ u8 sreg_qnum;
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
};
static void nft_queue_eval(const struct nft_expr *expr,
@@ -68,6 +68,30 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static int nft_queue_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ break;
+ case NFPROTO_NETDEV: /* lacks okfn */
+ fallthrough;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, supported_hooks);
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
@@ -111,8 +135,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
struct nft_queue *priv = nft_expr_priv(expr);
int err;
- priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
- err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+ err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM],
+ &priv->sreg_qnum, sizeof(u32));
if (err < 0)
return err;
@@ -127,7 +151,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_queue_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -143,7 +168,8 @@ nla_put_failure:
}
static int
-nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_queue_sreg_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -164,6 +190,8 @@ static const struct nft_expr_ops nft_queue_ops = {
.eval = nft_queue_eval,
.init = nft_queue_init,
.dump = nft_queue_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_queue_sreg_ops = {
@@ -172,6 +200,8 @@ static const struct nft_expr_ops nft_queue_sreg_ops = {
.eval = nft_queue_sreg_eval,
.init = nft_queue_sreg_init,
.dump = nft_queue_sreg_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -216,3 +246,4 @@ module_exit(nft_queue_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Leblond <eric@regit.org>");
MODULE_ALIAS_NFT_EXPR("queue");
+MODULE_DESCRIPTION("Netfilter nftables queue module");
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 4413690591f2..df0798da2329 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -15,14 +15,20 @@
struct nft_quota {
atomic64_t quota;
unsigned long flags;
- atomic64_t consumed;
+ atomic64_t *consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ bool *report)
{
- return atomic64_add_return(skb->len, &priv->consumed) >=
- atomic64_read(&priv->quota);
+ u64 consumed = atomic64_add_return(skb->len, priv->consumed);
+ u64 quota = atomic64_read(&priv->quota);
+
+ if (report)
+ *report = consumed >= quota;
+
+ return consumed > quota;
}
static inline bool nft_quota_invert(struct nft_quota *priv)
@@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
+ if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
@@ -51,16 +57,16 @@ static void nft_quota_obj_eval(struct nft_object *obj,
const struct nft_pktinfo *pkt)
{
struct nft_quota *priv = nft_obj_data(obj);
- bool overquota;
+ bool overquota, report;
- overquota = nft_overquota(priv, pkt->skb);
+ overquota = nft_overquota(priv, pkt->skb, &report);
if (overquota ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
- if (overquota &&
+ if (report &&
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0,
- NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+ NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC);
}
static int nft_quota_do_init(const struct nlattr * const tb[],
@@ -90,13 +96,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
+ if (!priv->consumed)
+ return -ENOMEM;
+
atomic64_set(&priv->quota, quota);
priv->flags = flags;
- atomic64_set(&priv->consumed, consumed);
+ atomic64_set(priv->consumed, consumed);
return 0;
}
+static void nft_quota_do_destroy(const struct nft_ctx *ctx,
+ struct nft_quota *priv)
+{
+ kfree(priv->consumed);
+}
+
static int nft_quota_obj_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
@@ -128,7 +144,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
* that we see, don't go over the quota boundary in what we send to
* userspace.
*/
- consumed = atomic64_read(&priv->consumed);
+ consumed = atomic64_read(priv->consumed);
quota = atomic64_read(&priv->quota);
if (consumed >= quota) {
consumed_cap = quota;
@@ -145,7 +161,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
goto nla_put_failure;
if (reset) {
- atomic64_sub(consumed, &priv->consumed);
+ atomic64_sub(consumed, priv->consumed);
clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
}
return 0;
@@ -162,11 +178,20 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
return nft_quota_do_dump(skb, priv, reset);
}
+static void nft_quota_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_quota_obj_type;
static const struct nft_object_ops nft_quota_obj_ops = {
.type = &nft_quota_obj_type,
.size = sizeof(struct nft_quota),
.init = nft_quota_obj_init,
+ .destroy = nft_quota_obj_destroy,
.eval = nft_quota_obj_eval,
.dump = nft_quota_obj_dump,
.update = nft_quota_obj_update,
@@ -198,11 +223,37 @@ static int nft_quota_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_quota *priv = nft_expr_priv(expr);
- return nft_quota_do_dump(skb, priv, false);
+ return nft_quota_do_dump(skb, priv, reset);
+}
+
+static void nft_quota_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_destroy(ctx, priv);
+}
+
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
+{
+ struct nft_quota *priv_dst = nft_expr_priv(dst);
+ struct nft_quota *priv_src = nft_expr_priv(src);
+
+ priv_dst->quota = priv_src->quota;
+ priv_dst->flags = priv_src->flags;
+
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp);
+ if (!priv_dst->consumed)
+ return -ENOMEM;
+
+ *priv_dst->consumed = *priv_src->consumed;
+
+ return 0;
}
static struct nft_expr_type nft_quota_type;
@@ -211,7 +262,10 @@ static const struct nft_expr_ops nft_quota_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
.eval = nft_quota_eval,
.init = nft_quota_init,
+ .destroy = nft_quota_destroy,
+ .clone = nft_quota_clone,
.dump = nft_quota_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_quota_type __read_mostly = {
@@ -254,3 +308,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("quota");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
+MODULE_DESCRIPTION("Netfilter nftables quota module");
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 89efcc5a533d..ea382f7bbd78 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -15,7 +15,7 @@
struct nft_range_expr {
struct nft_data data_from;
struct nft_data data_to;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_range_ops op:8;
};
@@ -42,7 +42,7 @@ void nft_range_eval(const struct nft_expr *expr,
static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
[NFTA_RANGE_SREG] = { .type = NLA_U32 },
- [NFTA_RANGE_OP] = { .type = NLA_U32 },
+ [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
[NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
};
@@ -51,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
const struct nlattr * const tb[])
{
struct nft_range_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc_from, desc_to;
+ struct nft_data_desc desc_from = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_from),
+ };
+ struct nft_data_desc desc_to = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_to),
+ };
int err;
u32 op;
@@ -61,33 +68,23 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
!tb[NFTA_RANGE_TO_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
- &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ err = nft_data_init(NULL, &priv->data_from, &desc_from,
+ tb[NFTA_RANGE_FROM_DATA]);
if (err < 0)
return err;
- if (desc_from.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
- &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ err = nft_data_init(NULL, &priv->data_to, &desc_to,
+ tb[NFTA_RANGE_TO_DATA]);
if (err < 0)
goto err1;
- if (desc_to.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err2;
- }
-
if (desc_from.len != desc_to.len) {
err = -EINVAL;
goto err2;
}
- priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
- err = nft_validate_register_load(priv->sreg, desc_from.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg,
+ desc_from.len);
if (err < 0)
goto err2;
@@ -114,7 +111,8 @@ err1:
return err;
}
-static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_range_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_range_expr *priv = nft_expr_priv(expr);
@@ -140,6 +138,7 @@ static const struct nft_expr_ops nft_range_ops = {
.eval = nft_range_eval,
.init = nft_range_init,
.dump = nft_range_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_range_type __read_mostly = {
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 5b779171565c..95eedad85c83 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -14,20 +14,20 @@
#include <net/netfilter/nf_tables.h>
struct nft_redir {
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
u16 flags;
};
static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
[NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_REDIR_FLAGS] = { .type = NLA_U32 },
+ [NFTA_REDIR_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
static int nft_redir_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
int err;
@@ -48,38 +48,34 @@ static int nft_redir_init(const struct nft_ctx *ctx,
unsigned int plen;
int err;
- plen = sizeof_field(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
priv->sreg_proto_max = priv->sreg_proto_min;
}
+
+ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
- if (tb[NFTA_REDIR_FLAGS]) {
+ if (tb[NFTA_REDIR_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_redir_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_redir *priv = nft_expr_priv(expr);
@@ -102,25 +98,37 @@ nla_put_failure:
return -1;
}
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_redir_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_ipv4_multi_range_compat mr;
+ const struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_range2 range;
- memset(&mr, 0, sizeof(mr));
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
if (priv->sreg_proto_min) {
- mr.range[0].min.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- mr.range[0].max.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
}
- mr.range[0].flags |= priv->flags;
-
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void
@@ -133,11 +141,12 @@ static struct nft_expr_type nft_redir_ipv4_type;
static const struct nft_expr_ops nft_redir_ipv4_ops = {
.type = &nft_redir_ipv4_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv4_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_ipv4_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
@@ -150,28 +159,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
};
#ifdef CONFIG_NF_TABLES_IPV6
-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
-
- range.flags |= priv->flags;
-
- regs->verdict.code =
- nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
-}
-
static void
nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -182,11 +169,12 @@ static struct nft_expr_type nft_redir_ipv6_type;
static const struct nft_expr_ops nft_redir_ipv6_ops = {
.type = &nft_redir_ipv6_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv6_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_ipv6_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
@@ -200,20 +188,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
#endif
#ifdef CONFIG_NF_TABLES_INET
-static void nft_redir_inet_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- return nft_redir_ipv4_eval(expr, regs, pkt);
- case NFPROTO_IPV6:
- return nft_redir_ipv6_eval(expr, regs, pkt);
- }
-
- WARN_ON_ONCE(1);
-}
-
static void
nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
@@ -224,11 +198,12 @@ static struct nft_expr_type nft_redir_inet_type;
static const struct nft_expr_ops nft_redir_inet_ops = {
.type = &nft_redir_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_inet_eval,
+ .eval = nft_redir_eval,
.init = nft_redir_init,
.destroy = nft_redir_inet_destroy,
.dump = nft_redir_dump,
.validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_redir_inet_type __read_mostly = {
@@ -236,7 +211,7 @@ static struct nft_expr_type nft_redir_inet_type __read_mostly = {
.name = "redir",
.ops = &nft_redir_inet_ops,
.policy = nft_redir_policy,
- .maxattr = NFTA_MASQ_MAX,
+ .maxattr = NFTA_REDIR_MAX,
.owner = THIS_MODULE,
};
@@ -292,3 +267,4 @@ module_exit(nft_redir_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
MODULE_ALIAS_NFT_EXPR("redir");
+MODULE_DESCRIPTION("Netfilter nftables redirect support");
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 00f865fb80ca..196a92c7ea09 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -18,19 +18,19 @@
#include <linux/icmpv6.h>
const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
- [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
+ [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
};
EXPORT_SYMBOL_GPL(nft_reject_policy);
int nft_reject_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT));
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING));
}
EXPORT_SYMBOL_GPL(nft_reject_validate);
@@ -39,6 +39,7 @@ int nft_reject_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -46,9 +47,17 @@ int nft_reject_init(const struct nft_ctx *ctx,
priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
return -EINVAL;
- priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
case NFT_REJECT_TCP_RST:
break;
default:
@@ -59,7 +68,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_reject_init);
-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_reject_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_reject *priv = nft_expr_priv(expr);
@@ -68,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
@@ -119,3 +130,4 @@ EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index f41f414b72d1..49020e67304a 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -28,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
@@ -44,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach6(nft_net(pkt), pkt->skb,
@@ -58,60 +60,15 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
regs->verdict.code = NF_DROP;
}
-static int nft_reject_inet_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_reject_inet_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_inet_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_INGRESS));
}
static struct nft_expr_type nft_reject_inet_type;
@@ -119,9 +76,10 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.type = &nft_reject_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_inet_eval,
- .init = nft_reject_inet_init,
- .dump = nft_reject_inet_dump,
- .validate = nft_reject_validate,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_inet_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
@@ -149,3 +107,4 @@ module_exit(nft_reject_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(1, "reject");
+MODULE_DESCRIPTION("Netfilter nftables reject inet support");
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
new file mode 100644
index 000000000000..2558ce1505d9
--- /dev/null
+++ b/net/netfilter/nft_reject_netdev.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com>
+ * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb)
+{
+ dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol),
+ eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest,
+ nskb->len);
+ dev_queue_xmit(nskb);
+}
+
+static void nft_reject_netdev_send_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+
+static void nft_reject_netdev_send_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct ethhdr *eth = eth_hdr(pkt->skb);
+ struct nft_reject *priv = nft_expr_priv(expr);
+ const unsigned char *dest = eth->h_dest;
+
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
+static struct nft_expr_type nft_reject_netdev_type;
+static const struct nft_expr_ops nft_reject_netdev_ops = {
+ .type = &nft_reject_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_netdev_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_netdev_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "reject",
+ .ops = &nft_reject_netdev_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_reject_netdev_type);
+}
+
+static void __exit nft_reject_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_netdev_type);
+}
+
+module_init(nft_reject_netdev_module_init);
+module_exit(nft_reject_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>");
+MODULE_AUTHOR("Jose M. Guisado <guigom@riseup.net>");
+MODULE_DESCRIPTION("Reject packets from netdev via nftables");
+MODULE_ALIAS_NFT_AF_EXPR(5, "reject");
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 7cfcb0e2f7ee..dc50b9a5bd68 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -15,7 +15,7 @@
struct nft_rt {
enum nft_rt_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
};
static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst)
@@ -73,14 +73,14 @@ void nft_rt_get_eval(const struct nft_expr *expr,
if (nft_pf(pkt) != NFPROTO_IPV4)
goto err;
- *dest = (__force u32)rt_nexthop((const struct rtable *)dst,
+ *dest = (__force u32)rt_nexthop(dst_rtable(dst),
ip_hdr(skb)->daddr);
break;
case NFT_RT_NEXTHOP6:
if (nft_pf(pkt) != NFPROTO_IPV6)
goto err;
- memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+ memcpy(dest, rt6_nexthop(dst_rt6_info(dst),
&ipv6_hdr(skb)->daddr),
sizeof(struct in6_addr));
break;
@@ -104,7 +104,7 @@ err:
static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
[NFTA_RT_DREG] = { .type = NLA_U32 },
- [NFTA_RT_KEY] = { .type = NLA_U32 },
+ [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_rt_get_init(const struct nft_ctx *ctx,
@@ -141,13 +141,12 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_rt_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_rt *priv = nft_expr_priv(expr);
@@ -161,12 +160,16 @@ nla_put_failure:
return -1;
}
-static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_rt *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->key) {
case NFT_RT_NEXTHOP4:
case NFT_RT_NEXTHOP6:
@@ -192,6 +195,7 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.init = nft_rt_get_init,
.dump = nft_rt_get_dump,
.validate = nft_rt_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_rt_type __read_mostly = {
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 2a81ea421819..8d3f040a904a 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -13,6 +13,7 @@
#include <net/netfilter/nf_tables_core.h>
struct nft_bitmap_elem {
+ struct nft_elem_priv priv;
struct list_head head;
struct nft_set_ext ext;
};
@@ -21,7 +22,7 @@ struct nft_bitmap_elem {
* the element state in the current and the future generation.
*
* An element can be in three states. The generation cursor is represented using
- * the ^ character, note that this cursor shifts on every succesful transaction.
+ * the ^ character, note that this cursor shifts on every successful transaction.
* If no transaction is going on, we observe all elements are in the following
* state:
*
@@ -39,7 +40,7 @@ struct nft_bitmap_elem {
* 10 = this element is active in the current generation and it becomes inactive
* ^ in the next one. This happens when the element is deactivated but commit
* path has not yet been executed yet, so removal is still pending. On
- * transation abortion, the next generation bit is reset to go back to
+ * transaction abortion, the next generation bit is reset to go back to
* restore its previous state.
*/
struct nft_bitmap {
@@ -73,26 +74,34 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}
-static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
const struct nft_bitmap *priv = nft_set_priv(set);
+ static const struct nft_set_ext found;
u8 genmask = nft_genmask_cur(net);
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
- return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+ if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+ return &found;
+
+ return NULL;
}
static struct nft_bitmap_elem *
-nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+nft_bitmap_elem_find(const struct net *net,
+ const struct nft_set *set, struct nft_bitmap_elem *this,
u8 genmask)
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(net)->commit_mutex)) {
if (memcmp(nft_set_ext_key(&be->ext),
nft_set_ext_key(&this->ext), set->klen) ||
!nft_set_elem_active(&be->ext, genmask))
@@ -103,8 +112,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
return NULL;
}
-static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_bitmap_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
const struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -115,23 +125,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
!nft_set_elem_active(&be->ext, genmask))
continue;
- return be;
+ return &be->priv;
}
return ERR_PTR(-ENOENT);
}
static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *new = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
- be = nft_bitmap_elem_find(set, new, genmask);
+ be = nft_bitmap_elem_find(net, set, new, genmask);
if (be) {
- *ext = &be->ext;
+ *elem_priv = &be->priv;
return -EEXIST;
}
@@ -143,12 +153,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
return 0;
}
-static void nft_bitmap_remove(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static void nft_bitmap_remove(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
@@ -160,47 +169,46 @@ static void nft_bitmap_remove(const struct net *net,
static void nft_bitmap_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 11 state. */
priv->bitmap[idx] |= (genmask << off);
- nft_set_elem_change_active(net, set, &be->ext);
+ nft_clear(net, &be->ext);
}
-static bool nft_bitmap_flush(const struct net *net,
- const struct nft_set *set, void *_be)
+static void nft_bitmap_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
- struct nft_bitmap_elem *be = _be;
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 10 state, similar to deactivation. */
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
-
- return true;
}
-static void *nft_bitmap_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *this = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, elem->key.val.data, &idx, &off);
- be = nft_bitmap_elem_find(set, this, genmask);
+ be = nft_bitmap_elem_find(net, set, this, genmask);
if (!be)
return NULL;
@@ -208,7 +216,7 @@ static void *nft_bitmap_deactivate(const struct net *net,
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
- return be;
+ return &be->priv;
}
static void nft_bitmap_walk(const struct nft_ctx *ctx,
@@ -217,17 +225,13 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- struct nft_set_elem elem;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&be->ext, iter->genmask))
- goto cont;
- elem.priv = be;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &be->priv);
if (iter->err < 0)
return;
@@ -264,19 +268,22 @@ static int nft_bitmap_init(const struct nft_set *set,
{
struct nft_bitmap *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);
+
INIT_LIST_HEAD(&priv->list);
priv->bitmap_size = nft_bitmap_size(set->klen);
return 0;
}
-static void nft_bitmap_destroy(const struct nft_set *set)
+static void nft_bitmap_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be, *n;
list_for_each_entry_safe(be, n, &priv->list, head)
- nft_set_elem_destroy(set, be, true);
+ nf_tables_set_elem_destroy(ctx, set, &be->priv);
}
static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 4d3f147e8d8d..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -24,10 +24,14 @@
struct nft_rhash {
struct rhashtable ht;
struct delayed_work gc_work;
+ u32 wq_gc_seq;
};
struct nft_rhash_elem {
+ struct nft_elem_priv priv;
struct rhash_head node;
+ struct llist_node walk_node;
+ u32 wq_gc_seq;
struct nft_set_ext ext;
};
@@ -35,6 +39,7 @@ struct nft_rhash_cmp_arg {
const struct nft_set *set;
const u32 *key;
u8 genmask;
+ u64 tstamp;
};
static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed)
@@ -59,7 +64,9 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
- if (nft_set_elem_expired(&he->ext))
+ if (nft_set_elem_is_dead(&he->ext))
+ return 1;
+ if (__nft_set_elem_expired(&he->ext, x->tstamp))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
return 1;
@@ -74,8 +81,10 @@ static const struct rhashtable_params nft_rhash_params = {
.automatic_shrinking = true,
};
-static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rhash *priv = nft_set_priv(set);
const struct nft_rhash_elem *he;
@@ -83,17 +92,19 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- *ext = &he->ext;
+ return &he->ext;
- return !!he;
+ return NULL;
}
-static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rhash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -101,39 +112,40 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- return he;
+ return &he->priv;
return ERR_PTR(-ENOENT);
}
-static bool nft_rhash_update(struct nft_set *set, const u32 *key,
- void *(*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *regs),
- const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+nft_rhash_update(struct nft_set *set, const u32 *key,
+ const struct nft_expr *expr, struct nft_regs *regs)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he, *prev;
+ struct nft_elem_priv *elem_priv;
struct nft_rhash_cmp_arg arg = {
.genmask = NFT_GENMASK_ANY,
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
goto out;
- he = new(set, expr, regs);
- if (he == NULL)
+ elem_priv = nft_dynset_new(set, expr, regs);
+ if (!elem_priv)
goto err1;
+ he = nft_elem_priv_cast(elem_priv);
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -141,69 +153,67 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
+ atomic_dec(&set->nelems);
he = prev;
}
out:
- *ext = &he->ext;
- return true;
+ return &he->ext;
err2:
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
+ atomic_dec(&set->nelems);
err1:
- return false;
+ return NULL;
}
static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
struct nft_rhash_cmp_arg arg = {
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
struct nft_rhash_elem *prev;
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
return PTR_ERR(prev);
if (prev) {
- *ext = &prev->ext;
+ *elem_priv = &prev->priv;
return -EEXIST;
}
return 0;
}
static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = elem->priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_rhash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rhash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext)) {
- nft_set_elem_change_active(net, set, &he->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &he->ext);
}
-static void *nft_rhash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -211,25 +221,25 @@ static void *nft_rhash_deactivate(const struct net *net,
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
- if (he != NULL &&
- !nft_rhash_flush(net, set, he))
- he = NULL;
+ if (he)
+ nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
- return he;
+ return &he->priv;
}
static void nft_rhash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
}
@@ -249,16 +259,17 @@ static bool nft_rhash_delete(const struct nft_set *set,
if (he == NULL)
return false;
- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+ nft_set_elem_dead(&he->ext);
+
+ return true;
}
-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he;
struct rhashtable_iter hti;
- struct nft_set_elem elem;
+ struct nft_rhash_elem *he;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
@@ -275,14 +286,8 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&he->ext))
- goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
-
- elem.priv = he;
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
break;
@@ -293,51 +298,199 @@ cont:
rhashtable_walk_exit(&hti);
}
+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_elem *he, *tmp;
+ struct llist_node *first_node;
+ struct rhashtable_iter hti;
+ LLIST_HEAD(walk_list);
+
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (set->in_update_walk) {
+ /* This can happen with bogus rulesets during ruleset validation
+ * when a verdict map causes a jump back to the same map.
+ *
+ * Without this extra check the walk_next loop below will see
+ * elems on the callers walk_list and skip (not validate) them.
+ */
+ iter->err = -EMLINK;
+ return;
+ }
+
+ /* walk happens under RCU.
+ *
+ * We create a snapshot list so ->iter callback can sleep.
+ * commit_mutex is held, elements can ...
+ * .. be added in parallel from dataplane (dynset)
+ * .. be marked as dead in parallel from dataplane (dynset).
+ * .. be queued for removal in parallel (gc timeout).
+ * .. not be freed: transaction mutex is held.
+ */
+ rhashtable_walk_enter(&priv->ht, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
+ }
+
+ continue;
+ }
+
+ /* rhashtable resized during walk, skip */
+ if (llist_on_list(&he->walk_node))
+ continue;
+
+ llist_add(&he->walk_node, &walk_list);
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ first_node = __llist_del_all(&walk_list);
+ set->in_update_walk = true;
+ llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+ if (iter->err == 0) {
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
+ if (iter->err == 0)
+ iter->count++;
+ }
+
+ /* all entries must be cleared again, else next ->walk iteration
+ * will skip entries.
+ */
+ init_llist_node(&he->walk_node);
+ }
+ set->in_update_walk = false;
+}
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ /* only relevant for netlink dumps which use READ type */
+ WARN_ON_ONCE(iter->skip != 0);
+
+ nft_rhash_walk_update(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ nft_rhash_walk_ro(ctx, set, iter);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
+ struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ if (expr->ops->gc &&
+ expr->ops->gc(read_pnet(&set->net), expr) &&
+ set->flags & NFT_SET_EVAL)
+ return true;
+ }
+
+ return false;
+}
+
static void nft_rhash_gc(struct work_struct *work)
{
+ struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
- struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
+ struct nft_trans_gc *gc;
+ struct net *net;
+ u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ if (nft_set_gc_is_pending(set))
+ goto done;
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
+
+ /* Elements never collected use a zero gc worker sequence number. */
+ if (unlikely(++priv->wq_gc_seq == 0))
+ priv->wq_gc_seq++;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- break;
- continue;
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
}
- if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(&he->ext);
-
- if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
- goto gc;
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
}
- if (!nft_set_elem_expired(&he->ext))
- continue;
-gc:
- if (nft_set_elem_mark_busy(&he->ext))
+
+ /* rhashtable walk is unstable, already seen in this gc run?
+ * Then, skip this element. In case of (unlikely) sequence
+ * wraparound and stale element wq_gc_seq, next gc run will
+ * just find this expired element.
+ */
+ if (he->wq_gc_seq == priv->wq_gc_seq)
continue;
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- break;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
+ if (nft_set_elem_is_dead(&he->ext))
+ goto dead_elem;
+
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_rhash_expr_needs_gc_run(set, &he->ext))
+ goto needs_gc_run;
+
+ if (!nft_set_elem_expired(&he->ext))
+ continue;
+needs_gc_run:
+ nft_set_elem_dead(&he->ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
+
+ /* annotate gc sequence for this attempt. */
+ he->wq_gc_seq = priv->wq_gc_seq;
+ nft_trans_gc_elem_add(gc, he);
}
+
+ gc = nft_trans_gc_catchall_async(gc, gc_seq);
+
+try_later:
+ /* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
- nft_set_gc_batch_complete(gcb);
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
@@ -364,6 +517,8 @@ static int nft_rhash_init(const struct nft_set *set,
struct rhashtable_params params = nft_rhash_params;
int err;
+ BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0);
+
params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT;
params.key_len = set->klen;
@@ -372,30 +527,50 @@ static int nft_rhash_init(const struct nft_set *set,
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
+ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
}
+struct nft_rhash_ctx {
+ const struct nft_ctx ctx;
+ const struct nft_set *set;
+};
+
static void nft_rhash_elem_destroy(void *ptr, void *arg)
{
- nft_set_elem_destroy(arg, ptr, true);
+ struct nft_rhash_ctx *rhash_ctx = arg;
+ struct nft_rhash_elem *he = ptr;
+
+ nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv);
}
-static void nft_rhash_destroy(const struct nft_set *set)
+static void nft_rhash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_ctx rhash_ctx = {
+ .ctx = *ctx,
+ .set = set,
+ };
cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
- (void *)set);
+ (void *)&rhash_ctx);
}
+/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
+#define NFT_MAX_BUCKETS (1U << 31)
+
static u32 nft_hash_buckets(u32 size)
{
- return roundup_pow_of_two(size * 4 / 3);
+ u64 val = div_u64((u64)size * 4, 3);
+
+ if (val >= NFT_MAX_BUCKETS)
+ return NFT_MAX_BUCKETS;
+
+ return roundup_pow_of_two(val);
}
static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features,
@@ -415,12 +590,15 @@ struct nft_hash {
};
struct nft_hash_elem {
+ struct nft_elem_priv priv;
struct hlist_node node;
struct nft_set_ext ext;
};
-static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -431,16 +609,15 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
-static void *nft_hash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_hash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -452,14 +629,15 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
nft_set_elem_active(&he->ext, genmask))
- return he;
+ return &he->priv;
}
return ERR_PTR(-ENOENT);
}
-static bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -472,12 +650,10 @@ static bool nft_hash_lookup_fast(const struct net *net,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
k2 = *(u32 *)nft_set_ext_key(&he->ext)->data;
if (k1 == k2 &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
@@ -499,9 +675,9 @@ static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
static int nft_hash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
- struct nft_hash_elem *this = elem->priv, *he;
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
u32 hash;
@@ -511,7 +687,7 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
if (!memcmp(nft_set_ext_key(&this->ext),
nft_set_ext_key(&he->ext), set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
+ *elem_priv = &he->priv;
return -EEXIST;
}
}
@@ -520,28 +696,28 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
}
static void nft_hash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_hash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_hash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
nft_set_elem_change_active(net, set, &he->ext);
- return true;
}
-static void *nft_hash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_hash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *this = elem->priv, *he;
u8 genmask = nft_genmask_next(net);
u32 hash;
@@ -551,7 +727,7 @@ static void *nft_hash_deactivate(const struct net *net,
set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
nft_set_elem_change_active(net, set, &he->ext);
- return he;
+ return &he->priv;
}
}
return NULL;
@@ -559,9 +735,9 @@ static void *nft_hash_deactivate(const struct net *net,
static void nft_hash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
hlist_del_rcu(&he->node);
}
@@ -571,19 +747,15 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
- struct nft_set_elem elem;
int i;
for (i = 0; i < priv->buckets; i++) {
- hlist_for_each_entry_rcu(he, &priv->table[i], node) {
+ hlist_for_each_entry_rcu(he, &priv->table[i], node,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
-
- elem.priv = he;
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
return;
cont:
@@ -596,7 +768,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
}
static int nft_hash_init(const struct nft_set *set,
@@ -611,7 +783,8 @@ static int nft_hash_init(const struct nft_set *set,
return 0;
}
-static void nft_hash_destroy(const struct nft_set *set)
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
@@ -621,7 +794,7 @@ static void nft_hash_destroy(const struct nft_set *set)
for (i = 0; i < priv->buckets; i++) {
hlist_for_each_entry_safe(he, next, &priv->table[i], node) {
hlist_del_rcu(&he->node);
- nft_set_elem_destroy(set, he, true);
+ nf_tables_set_elem_destroy(ctx, set, &he->priv);
}
}
}
@@ -636,8 +809,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
@@ -654,8 +827,8 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 8b5acc6910fd..112fe46788b6 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -312,7 +312,7 @@
* Jay Ligatti, Josh Kuhn, and Chris Gage.
* Proceedings of the IEEE International Conference on Computer
* Communication Networks (ICCCN), August 2010.
- * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
+ * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
*
* [Rottenstreich 2010]
* Worst-Case TCAM Rule Expansion
@@ -325,7 +325,7 @@
* Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
* and Patrick Eugster.
* Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
- * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
+ * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
*/
#include <linux/kernel.h>
@@ -342,9 +342,6 @@
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
-/* Current working bitmap index, toggled between field matches */
-static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
-
/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
@@ -362,11 +359,13 @@ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
*
* Return: -1 on no match, bit position on 'match_only', 0 otherwise.
*/
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
- union nft_pipapo_map_bucket *mt, bool match_only)
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only)
{
unsigned long bitset;
- int k, ret = -1;
+ unsigned int k;
+ int ret = -1;
for (k = 0; k < len; k++) {
bitset = map[k];
@@ -398,41 +397,47 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
}
/**
- * nft_pipapo_lookup() - Lookup function
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- * @ext: nftables API extension pointer, filled with matching reference
+ * pipapo_get_slow() - Get matching element reference given key data
+ * @m: storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation.
*
- * Return: true on match, false otherwise.
+ * This is the main lookup function. It matches key data against either
+ * the working match set or the uncommitted copy, depending on what the
+ * caller passed to us.
+ * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup
+ * (datapath lookup) pass the active copy.
+ * The insertion path will pass the uncommitted working copy.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- unsigned long *res_map, *fill_map;
- u8 genmask = nft_genmask_cur(net);
- const u8 *rp = (const u8 *)key;
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
+ unsigned long *res_map, *fill_map, *map;
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
bool map_index;
int i;
local_bh_disable();
- map_index = raw_cpu_read(nft_pipapo_scratch_index);
-
- m = rcu_dereference(priv->match);
-
- if (unlikely(!m || !*raw_cpu_ptr(m->scratch)))
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
goto out;
+ __local_lock_nested_bh(&scratch->bh_lock);
+
+ map_index = scratch->map_index;
- res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0);
- fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max);
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res_map = map + (map_index ? m->bsize_max : 0);
+ fill_map = map + (map_index ? 0 : m->bsize_max);
- memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+ pipapo_resmap_init(m, res_map);
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
@@ -442,12 +447,12 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
* packet bytes value, then AND bucket value
*/
if (likely(f->bb == 8))
- pipapo_and_field_buckets_8bit(f, res_map, rp);
+ pipapo_and_field_buckets_8bit(f, res_map, data);
else
- pipapo_and_field_buckets_4bit(f, res_map, rp);
+ pipapo_and_field_buckets_4bit(f, res_map, data);
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
- rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -460,16 +465,19 @@ next_match:
b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
last);
if (b < 0) {
- raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
- return false;
+ return NULL;
}
if (last) {
- *ext = &f->mt[b].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask)))
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[b].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask)))
goto next_match;
/* Last field: we're just returning the key without
@@ -477,10 +485,10 @@ next_match:
* current inactive bitmap is clean and can be reused as
* *next* bitmap (not initial) for the next packet.
*/
- raw_cpu_write(nft_pipapo_scratch_index, map_index);
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
-
- return true;
+ return e;
}
/* Swap bitmap indices: res_map is the initial bitmap for the
@@ -490,119 +498,202 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
}
+ __local_unlock_nested_bh(&scratch->bh_lock);
out:
local_bh_enable();
- return false;
+ return NULL;
}
/**
* pipapo_get() - Get matching element reference given key data
- * @net: Network namespace
- * @set: nftables API set representation
+ * @m: Storage containing the set elements
* @data: Key data to be matched against existing elements
* @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
- * This is essentially the same as the lookup function, except that it matches
- * key data against the uncommitted copy and doesn't use preallocated maps for
- * bitmap results.
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
*
- * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise.
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-static struct nft_pipapo_elem *pipapo_get(const struct net *net,
- const struct nft_set *set,
- const u8 *data, u8 genmask)
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m = priv->clone;
- unsigned long *res_map, *fill_map = NULL;
- struct nft_pipapo_field *f;
- int i;
+ struct nft_pipapo_elem *e;
- res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
- if (!res_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
- }
+ local_bh_disable();
- fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC);
- if (!fill_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+ e = pipapo_get_avx2(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
}
+#endif
+ e = pipapo_get_slow(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+}
- memset(res_map, 0xff, m->bsize_max * sizeof(*res_map));
+/**
+ * nft_pipapo_lookup() - Dataplane fronted for main lookup function
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: pointer to nft registers containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy.
+ * Unlike other set types, this uses 0 instead of nft_genmask_cur().
+ *
+ * This is because new (future) elements are not reachable from
+ * priv->match, they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but thew
+ * newly matching entries are unreachable.
+ *
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get
+ * skipped, new-gen entries are only reachable from priv->clone.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
+ *
+ * Return: ntables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const struct nft_pipapo_elem *e;
- nft_pipapo_for_each_field(f, i, m) {
- bool last = i == m->field_count - 1;
- int b;
+ m = rcu_dereference(priv->match);
+ e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64());
- /* For each bit group: select lookup table bucket depending on
- * packet bytes value, then AND bucket value
- */
- if (f->bb == 8)
- pipapo_and_field_buckets_8bit(f, res_map, data);
- else if (f->bb == 4)
- pipapo_and_field_buckets_4bit(f, res_map, data);
- else
- BUG();
+ return e ? &e->ext : NULL;
+}
- data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+/**
+ * nft_pipapo_get() - Get matching element reference given key data
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @flags: Unused
+ *
+ * This function is called from the control plane path under
+ * RCU read lock.
+ *
+ * Return: set element private pointer or ERR_PTR(-ENOENT).
+ */
+static struct nft_elem_priv *
+nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = rcu_dereference(priv->match);
+ struct nft_pipapo_elem *e;
- /* Now populate the bitmap for the next field, unless this is
- * the last field, in which case return the matched 'ext'
- * pointer if any.
- *
- * Now res_map contains the matching bitmap, and fill_map is the
- * bitmap for the next field.
- */
-next_match:
- b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
- last);
- if (b < 0)
- goto out;
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net), get_jiffies_64());
+ if (!e)
+ return ERR_PTR(-ENOENT);
- if (last) {
- if (nft_set_elem_expired(&f->mt[b].e->ext) ||
- (genmask &&
- !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
- goto next_match;
+ return &e->priv;
+}
- ret = f->mt[b].e;
- goto out;
- }
+/**
+ * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
+ * @f: Field containing mapping table
+ * @old_rules: Amount of existing mapped rules
+ * @rules: Amount of new rules to map
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int pipapo_realloc_mt(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
+{
+ union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
+ const unsigned int extra = PAGE_SIZE / sizeof(*new_mt);
+ unsigned int rules_alloc = rules;
- data += NFT_PIPAPO_GROUPS_PADDING(f);
+ might_sleep();
- /* Swap bitmap indices: fill_map will be the initial bitmap for
- * the next field (i.e. the new res_map), and res_map is
- * guaranteed to be all-zeroes at this point, ready to be filled
- * according to the next mapping table.
- */
- swap(res_map, fill_map);
+ if (unlikely(rules == 0))
+ goto out_free;
+
+ /* growing and enough space left, no action needed */
+ if (rules > old_rules && f->rules_alloc > rules)
+ return 0;
+
+ /* downsize and extra slack has not grown too large */
+ if (rules < old_rules) {
+ unsigned int remove = f->rules_alloc - rules;
+
+ if (remove < (2u * extra))
+ return 0;
}
-out:
- kfree(fill_map);
- kfree(res_map);
- return ret;
+ /* If set needs more than one page of memory for rules then
+ * allocate another extra page to avoid frequent reallocation.
+ */
+ if (rules > extra &&
+ check_add_overflow(rules, extra, &rules_alloc))
+ return -EOVERFLOW;
+
+ if (rules_alloc > (INT_MAX / sizeof(*new_mt)))
+ return -ENOMEM;
+
+ new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT);
+ if (!new_mt)
+ return -ENOMEM;
+
+ if (old_mt)
+ memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
+
+ if (rules > old_rules) {
+ memset(new_mt + old_rules, 0,
+ (rules - old_rules) * sizeof(*new_mt));
+ }
+out_free:
+ f->rules_alloc = rules_alloc;
+ f->mt = new_mt;
+
+ kvfree(old_mt);
+
+ return 0;
}
+
/**
- * nft_pipapo_get() - Get matching element reference given key data
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- * @flags: Unused
+ * lt_calculate_size() - Get storage size for lookup table with overflow check
+ * @groups: Amount of bit groups
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @bsize: Size of each bucket in lookup table, in longs
+ *
+ * Return: allocation size including alignment overhead, negative on overflow
*/
-static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb,
+ unsigned int bsize)
{
- return pipapo_get(net, set, (const u8 *)elem->key.val.data,
- nft_genmask_cur(net));
+ ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long);
+
+ if (check_mul_overflow(ret, bsize, &ret))
+ return -1;
+ if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret))
+ return -1;
+ if (ret > INT_MAX)
+ return -1;
+
+ return ret;
}
/**
@@ -617,12 +708,16 @@ static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
*
* Return: 0 on success, -ENOMEM on allocation failure.
*/
-static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
+static int pipapo_resize(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
{
long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
- union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt;
- size_t new_bucket_size, copy;
- int group, bucket;
+ unsigned int new_bucket_size, copy;
+ int group, bucket, err;
+ ssize_t lt_size;
+
+ if (rules >= NFT_PIPAPO_RULE0_MAX)
+ return -ENOSPC;
new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
#ifdef NFT_PIPAPO_ALIGN
@@ -638,10 +733,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
- new_bucket_size * sizeof(*new_lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size);
+ if (lt_size < 0)
+ return -ENOMEM;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return -ENOMEM;
@@ -662,27 +758,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
}
mt:
- new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
- if (!new_mt) {
+ err = pipapo_realloc_mt(f, old_rules, rules);
+ if (err) {
kvfree(new_lt);
- return -ENOMEM;
- }
-
- memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt));
- if (rules > old_rules) {
- memset(new_mt + old_rules, 0,
- (rules - old_rules) * sizeof(*new_mt));
+ return err;
}
if (new_lt) {
f->bsize = new_bucket_size;
- NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+ f->lt = new_lt;
kvfree(old_lt);
}
- f->mt = new_mt;
- kvfree(old_mt);
-
return 0;
}
@@ -833,9 +920,9 @@ static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
*/
static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
{
+ unsigned int groups, bb;
unsigned long *new_lt;
- int groups, bb;
- size_t lt_size;
+ ssize_t lt_size;
lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
sizeof(*f->lt);
@@ -845,15 +932,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
groups = f->groups * 2;
bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
} else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
groups = f->groups / 2;
bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
- lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
- sizeof(*f->lt);
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
/* Don't increase group width if the resulting lookup table size
* would exceed the upper size threshold for a "small" set.
@@ -864,7 +953,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
return;
}
- new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
return;
@@ -884,7 +973,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
f->groups = groups;
f->bb = bb;
kvfree(f->lt);
- NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+ f->lt = new_lt;
}
/**
@@ -901,12 +990,14 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret, bit_offset = 0;
+ unsigned int rule = f->rules, group, ret, bit_offset = 0;
- ret = pipapo_resize(f, f->rules - 1, f->rules);
+ ret = pipapo_resize(f, f->rules, f->rules + 1);
if (ret)
return ret;
+ f->rules++;
+
for (group = 0; group < f->groups; group++) {
int i, v;
u8 mask;
@@ -1051,7 +1142,9 @@ static int pipapo_expand(struct nft_pipapo_field *f,
step++;
if (step >= len) {
if (!masks) {
- pipapo_insert(f, base, 0);
+ err = pipapo_insert(f, base, 0);
+ if (err < 0)
+ return err;
masks = 1;
}
goto out;
@@ -1075,7 +1168,7 @@ out:
* @m: Matching data, including mapping table
* @map: Table of rule maps: array of first rule and amount of rules
* in next field a given rule maps to, for each field
- * @ext: For last field, nft_set_ext pointer matching rules map to
+ * @e: For last field, nft_set_ext pointer matching rules map to
*/
static void pipapo_map(struct nft_pipapo_match *m,
union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS],
@@ -1097,9 +1190,23 @@ static void pipapo_map(struct nft_pipapo_match *m,
}
/**
+ * pipapo_free_scratch() - Free per-CPU map at original address
+ * @m: Matching data
+ * @cpu: CPU number
+ */
+static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
+{
+ struct nft_pipapo_scratch *s;
+
+ s = *per_cpu_ptr(m->scratch, cpu);
+
+ kvfree(s);
+}
+
+/**
* pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
* @clone: Copy of matching data with pending insertions and deletions
- * @bsize_max Maximum bucket size, scratch maps cover two buckets
+ * @bsize_max: Maximum bucket size, scratch maps cover two buckets
*
* Return: 0 on success, -ENOMEM on failure.
*/
@@ -1109,14 +1216,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
int i;
for_each_possible_cpu(i) {
- unsigned long *scratch;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long *scratch_aligned;
-#endif
+ struct nft_pipapo_scratch *scratch;
- scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL, cpu_to_node(i));
+ scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL_ACCOUNT, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
* allocations: this means that some scratch maps have
@@ -1128,49 +1232,82 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
return -ENOMEM;
}
- kfree(*per_cpu_ptr(clone->scratch, i));
-
+ pipapo_free_scratch(clone, i);
+ local_lock_init(&scratch->bh_lock);
*per_cpu_ptr(clone->scratch, i) = scratch;
-
-#ifdef NFT_PIPAPO_ALIGN
- scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch);
- *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned;
-#endif
}
return 0;
}
+static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ const struct net *net = read_pnet(&set->net);
+
+ return lockdep_is_held(&nft_pernet(net)->commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old);
+
+/**
+ * pipapo_maybe_clone() - Build clone for pending data changes, if not existing
+ * @set: nftables API set representation
+ *
+ * Return: newly created or existing clone, if any. NULL on allocation failure
+ */
+static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+
+ if (priv->clone)
+ return priv->clone;
+
+ m = rcu_dereference_protected(priv->match,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = pipapo_clone(m);
+
+ return priv->clone;
+}
+
/**
* nft_pipapo_insert() - Validate and insert ranged elements
* @net: Network namespace
* @set: nftables API set representation
* @elem: nftables API element representation containing key data
- * @ext2: Filled with pointer to &struct nft_set_ext in inserted element
+ * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element
*
* Return: 0 on success, error pointer on failure.
*/
static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext2)
+ struct nft_elem_priv **elem_priv)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const u8 *start = (const u8 *)elem->key.val.data, *end;
- struct nft_pipapo_elem *e = elem->priv, *dup;
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m = priv->clone;
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
u8 genmask = nft_genmask_next(net);
+ struct nft_pipapo_elem *e, *dup;
+ u64 tstamp = nft_net_tstamp(net);
struct nft_pipapo_field *f;
+ const u8 *start_p, *end_p;
int i, bsize_max, err = 0;
+ if (!m)
+ return -ENOMEM;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
end = (const u8 *)nft_set_ext_key_end(ext)->data;
else
end = start;
- dup = pipapo_get(net, set, start, genmask);
- if (!IS_ERR(dup)) {
+ dup = pipapo_get(m, start, genmask, tstamp);
+ if (dup) {
/* Check if we already have the same exact entry */
const struct nft_data *dup_key, *dup_end;
@@ -1182,30 +1319,31 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
!memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
- *ext2 = &dup->ext;
+ *elem_priv = &dup->priv;
return -EEXIST;
}
return -ENOTEMPTY;
}
- if (PTR_ERR(dup) == -ENOENT) {
- /* Look for partially overlapping entries */
- dup = pipapo_get(net, set, end, nft_genmask_next(net));
- }
-
- if (PTR_ERR(dup) != -ENOENT) {
- if (IS_ERR(dup))
- return PTR_ERR(dup);
- *ext2 = &dup->ext;
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(m, end, nft_genmask_next(net), tstamp);
+ if (dup) {
+ *elem_priv = &dup->priv;
return -ENOTEMPTY;
}
/* Validate */
- nft_pipapo_for_each_field(f, i, m) {
- const u8 *start_p = start, *end_p = end;
+ start_p = start;
+ end_p = end;
+
+ /* some helpers return -1, or 0 >= for valid rule pos,
+ * so we cannot support more than INT_MAX rules at this time.
+ */
+ BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX);
- if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX)
+ nft_pipapo_for_each_field(f, i, m) {
+ if (f->rules >= NFT_PIPAPO_RULE0_MAX)
return -ENOSPC;
if (memcmp(start_p, end_p,
@@ -1217,8 +1355,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
}
/* Insert */
- priv->dirty = true;
-
bsize_max = m->bsize_max;
nft_pipapo_for_each_field(f, i, m) {
@@ -1233,6 +1369,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
ret = pipapo_expand(f, start, end, f->groups * f->bb);
+ if (ret < 0)
+ return ret;
+
if (f->bsize > bsize_max)
bsize_max = f->bsize;
@@ -1242,17 +1381,20 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
- if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ put_cpu_ptr(m->scratch);
+
err = pipapo_realloc_scratch(m, bsize_max);
if (err)
return err;
- this_cpu_write(nft_pipapo_scratch_index, false);
-
m->bsize_max = bsize_max;
+ } else {
+ put_cpu_ptr(m->scratch);
}
- *ext2 = &e->ext;
+ e = nft_elem_priv_cast(elem->priv);
+ *elem_priv = &e->priv;
pipapo_map(m, rulemap, e);
@@ -1263,7 +1405,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
* pipapo_clone() - Clone matching data to create new working copy
* @old: Existing matching data
*
- * Return: copy of matching data passed as 'old', error pointer on failure
+ * Return: copy of matching data passed as 'old' or NULL.
*/
static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
{
@@ -1271,10 +1413,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
struct nft_pipapo_match *new;
int i;
- new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count,
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT);
if (!new)
- return ERR_PTR(-ENOMEM);
+ return NULL;
new->field_count = old->field_count;
new->bsize_max = old->bsize_max;
@@ -1283,11 +1424,11 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch)
goto out_scratch;
-#ifdef NFT_PIPAPO_ALIGN
- new->scratch_aligned = alloc_percpu(*new->scratch_aligned);
- if (!new->scratch_aligned)
- goto out_scratch;
-#endif
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(new->scratch, i) = NULL;
+
+ if (pipapo_realloc_scratch(new, old->bsize_max))
+ goto out_scratch_realloc;
rcu_head_init(&new->rcu);
@@ -1296,28 +1437,41 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
for (i = 0; i < old->field_count; i++) {
unsigned long *new_lt;
+ ssize_t lt_size;
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
- src->bsize * sizeof(*dst->lt) +
- NFT_PIPAPO_ALIGN_HEADROOM,
- GFP_KERNEL);
+ lt_size = lt_calculate_size(src->groups, src->bb, src->bsize);
+ if (lt_size < 0)
+ goto out_lt;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
if (!new_lt)
goto out_lt;
- NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+ dst->lt = new_lt;
memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
NFT_PIPAPO_LT_ALIGN(src->lt),
src->bsize * sizeof(*dst->lt) *
src->groups * NFT_PIPAPO_BUCKETS(src->bb));
- dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
- if (!dst->mt)
- goto out_mt;
+ if (src->rules > 0) {
+ if (src->rules_alloc > (INT_MAX / sizeof(*src->mt)))
+ goto out_mt;
+
+ dst->mt = kvmalloc_array(src->rules_alloc,
+ sizeof(*src->mt),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst->mt)
+ goto out_mt;
+
+ memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+ } else {
+ dst->mt = NULL;
+ dst->rules_alloc = 0;
+ }
- memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
src++;
dst++;
}
@@ -1332,14 +1486,14 @@ out_lt:
kvfree(dst->lt);
dst--;
}
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(new->scratch_aligned);
-#endif
+out_scratch_realloc:
+ for_each_possible_cpu(i)
+ pipapo_free_scratch(new, i);
out_scratch:
free_percpu(new->scratch);
kfree(new);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
/**
@@ -1371,10 +1525,10 @@ out_scratch:
*
* Return: Number of rules that originated from the same entry as @first.
*/
-static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
+static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first)
{
struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
- int r;
+ unsigned int r;
for (r = first; r < f->rules; r++) {
if (r != first && e != f->mt[r].e)
@@ -1427,8 +1581,9 @@ static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first)
* 0 1 2
* element pointers: 0x42 0x42 0x44
*/
-static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
- int start, int n, int to_offset, bool is_last)
+static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
+ unsigned int start, unsigned int n,
+ unsigned int to_offset, bool is_last)
{
int i;
@@ -1445,7 +1600,7 @@ static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules,
/**
* pipapo_drop() - Delete entry from lookup and mapping tables, given rule map
* @m: Matching data
- * @rulemap Table of rule maps, arrays of first rule and amount of rules
+ * @rulemap: Table of rule maps, arrays of first rule and amount of rules
* in next field a given entry maps to, for each field
*
* For each rule in lookup table buckets mapping to this set of rules, drop
@@ -1518,21 +1673,35 @@ static void pipapo_drop(struct nft_pipapo_match *m,
}
}
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+ struct nft_pipapo_elem *e)
+
+{
+ nft_setelem_data_deactivate(net, set, &e->priv);
+}
+
/**
* pipapo_gc() - Drop expired entries from set, destroy start and end elements
* @set: nftables API set representation
* @m: Matching data
*/
-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
{
struct nft_pipapo *priv = nft_set_priv(set);
- int rules_f0, first_rule = 0;
+ struct net *net = read_pnet(&set->net);
+ unsigned int rules_f0, first_rule = 0;
+ u64 tstamp = nft_net_tstamp(net);
+ struct nft_pipapo_elem *e;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
- struct nft_pipapo_field *f;
- struct nft_pipapo_elem *e;
- int i, start, rules_fx;
+ const struct nft_pipapo_field *f;
+ unsigned int i, start, rules_fx;
start = first_rule;
rules_fx = rules_f0;
@@ -1551,13 +1720,18 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
f--;
i--;
e = f->mt[rulemap[i].to].e;
- if (nft_set_elem_expired(&e->ext) &&
- !nft_set_elem_mark_busy(&e->ext)) {
- priv->dirty = true;
- pipapo_drop(m, rulemap);
- rcu_barrier();
- nft_set_elem_destroy(set, e, true);
+ /* synchronous gc never fails, there is no need to set on
+ * NFT_SET_ELEM_DEAD_BIT.
+ */
+ if (__nft_set_elem_expired(&e->ext, tstamp)) {
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ nft_pipapo_gc_deactivate(net, set, e);
+ pipapo_drop(m, rulemap);
+ nft_trans_gc_elem_add(gc, e);
/* And check again current first rule, which is now the
* first we haven't checked.
@@ -1567,7 +1741,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
}
}
- priv->last_gc = jiffies;
+ gc = nft_trans_gc_catchall_sync(gc);
+ if (gc) {
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
/**
@@ -1585,32 +1763,33 @@ static void pipapo_free_fields(struct nft_pipapo_match *m)
}
}
-/**
- * pipapo_reclaim_match - RCU callback to free fields from old matching data
- * @rcu: RCU head
- */
-static void pipapo_reclaim_match(struct rcu_head *rcu)
+static void pipapo_free_match(struct nft_pipapo_match *m)
{
- struct nft_pipapo_match *m;
int i;
- m = container_of(rcu, struct nft_pipapo_match, rcu);
-
for_each_possible_cpu(i)
- kfree(*per_cpu_ptr(m->scratch, i));
+ pipapo_free_scratch(m, i);
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
free_percpu(m->scratch);
-
pipapo_free_fields(m);
kfree(m);
}
/**
- * pipapo_commit() - Replace lookup data with current working copy
+ * pipapo_reclaim_match - RCU callback to free fields from old matching data
+ * @rcu: RCU head
+ */
+static void pipapo_reclaim_match(struct rcu_head *rcu)
+{
+ struct nft_pipapo_match *m;
+
+ m = container_of(rcu, struct nft_pipapo_match, rcu);
+ pipapo_free_match(m);
+}
+
+/**
+ * nft_pipapo_commit() - Replace lookup data with current working copy
* @set: nftables API set representation
*
* While at it, check if we should perform garbage collection on the working
@@ -1620,108 +1799,91 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
* We also need to create a new working copy for subsequent insertions and
* deletions.
*/
-static void pipapo_commit(const struct nft_set *set)
+static void nft_pipapo_commit(struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *new_clone, *old;
-
- if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- pipapo_gc(set, priv->clone);
+ struct nft_pipapo_match *old;
- if (!priv->dirty)
+ if (!priv->clone)
return;
- new_clone = pipapo_clone(priv->clone);
- if (IS_ERR(new_clone))
- return;
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ pipapo_gc(set, priv->clone);
- priv->dirty = false;
+ old = rcu_replace_pointer(priv->match, priv->clone,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = NULL;
- old = rcu_access_pointer(priv->match);
- rcu_assign_pointer(priv->match, priv->clone);
if (old)
call_rcu(&old->rcu, pipapo_reclaim_match);
+}
+
+static void nft_pipapo_abort(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
- priv->clone = new_clone;
+ if (!priv->clone)
+ return;
+ pipapo_free_match(priv->clone);
+ priv->clone = NULL;
}
/**
* nft_pipapo_activate() - Mark element reference as active given key, commit
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* On insertion, elements are added to a copy of the matching data currently
- * in use for lookups, and not directly inserted into current lookup data, so
- * we'll take care of that by calling pipapo_commit() here. Both
+ * in use for lookups, and not directly inserted into current lookup data. Both
* nft_pipapo_insert() and nft_pipapo_activate() are called once for each
* element, hence we can't purpose either one as a real commit operation.
*/
static void nft_pipapo_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_pipapo_elem *e;
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
- e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0);
- if (IS_ERR(e))
- return;
-
- nft_set_elem_change_active(net, set, &e->ext);
- nft_set_elem_clear_busy(&e->ext);
-
- pipapo_commit(set);
+ nft_clear(net, &e->ext);
}
/**
- * pipapo_deactivate() - Check that element is in set, mark as inactive
+ * nft_pipapo_deactivate() - Search for element and make it inactive
* @net: Network namespace
* @set: nftables API set representation
- * @data: Input key data
- * @ext: nftables API extension pointer, used to check for end element
- *
- * This is a convenience function that can be called from both
- * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same
- * operation.
+ * @elem: nftables API element representation containing key data
*
* Return: deactivated element if found, NULL otherwise.
*/
-static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
- const u8 *data, const struct nft_set_ext *ext)
+static struct nft_elem_priv *
+nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
struct nft_pipapo_elem *e;
- e = pipapo_get(net, set, data, nft_genmask_next(net));
- if (IS_ERR(e))
+ /* removal must occur on priv->clone, if we are low on memory
+ * we have no choice and must fail the removal request.
+ */
+ if (!m)
return NULL;
- nft_set_elem_change_active(net, set, &e->ext);
-
- return e;
-}
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_next(net), nft_net_tstamp(net));
+ if (!e)
+ return NULL;
-/**
- * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- *
- * Return: deactivated element if found, NULL otherwise.
- */
-static void *nft_pipapo_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ nft_set_elem_change_active(net, set, &e->ext);
- return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext);
+ return &e->priv;
}
/**
- * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive
+ * nft_pipapo_flush() - make element inactive
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* This is functionally the same as nft_pipapo_deactivate(), with a slightly
* different interface, and it's also called once for each element in a set
@@ -1735,13 +1897,12 @@ static void *nft_pipapo_deactivate(const struct net *net,
*
* Return: true if element was found and deactivated.
*/
-static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
- void *elem)
+static void nft_pipapo_flush(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_pipapo_elem *e = elem;
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
- return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext),
- &e->ext);
+ nft_set_elem_change_active(net, set, &e->ext);
}
/**
@@ -1864,7 +2025,7 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
* nft_pipapo_remove() - Remove element given key, commit
* @net: Network namespace
* @set: nftables API set representation
- * @elem: nftables API element representation containing key data
+ * @elem_priv: nftables API element representation containing key data
*
* Similarly to nft_pipapo_activate(), this is used as commit operation by the
* API, but it's called once per element in the pending transaction, so we can't
@@ -1872,20 +2033,17 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
* the matched element here, if any, and commit the updated matching data.
*/
static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m = priv->clone;
- struct nft_pipapo_elem *e = elem->priv;
- int rules_f0, first_rule = 0;
+ unsigned int rules_f0, first_rule = 0;
+ struct nft_pipapo_elem *e;
const u8 *data;
+ e = nft_elem_priv_cast(elem_priv);
data = (const u8 *)nft_set_ext_key(&e->ext);
- e = pipapo_get(net, set, data, 0);
- if (IS_ERR(e))
- return;
-
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const u8 *match_start, *match_end;
@@ -1893,12 +2051,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
int i, start, rules_fx;
match_start = data;
- match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+
+ if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END))
+ match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+ else
+ match_end = data;
start = first_rule;
rules_fx = rules_f0;
nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+
if (!pipapo_match_field(f, start, rules_fx,
match_start, match_end))
break;
@@ -1911,49 +2075,42 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
- }
- if (i == m->field_count) {
- priv->dirty = true;
- pipapo_drop(m, rulemap);
- pipapo_commit(set);
- return;
+ if (last && f->mt[rulemap[i].to].e == e) {
+ pipapo_drop(m, rulemap);
+ return;
+ }
}
first_rule += rules_f0;
}
+
+ WARN_ON_ONCE(1); /* elem_priv not found */
}
/**
- * nft_pipapo_walk() - Walk over elements
+ * nft_pipapo_do_walk() - Walk over elements in m
* @ctx: nftables API context
* @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
* @iter: Iterator
*
* As elements are referenced in the mapping array for the last field, directly
* scan that array: there's no need to follow rule mappings from the first
- * field.
+ * field. @m is protected either by RCU read lock or by transaction mutex.
*/
-static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_pipapo_match *m,
+ struct nft_set_iter *iter)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
- int i, r;
-
- rcu_read_lock();
- m = rcu_dereference(priv->match);
-
- if (unlikely(!m))
- goto out;
+ const struct nft_pipapo_field *f;
+ unsigned int i, r;
for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
;
for (r = 0; r < f->rules; r++) {
struct nft_pipapo_elem *e;
- struct nft_set_elem elem;
if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
continue;
@@ -1962,21 +2119,52 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
goto cont;
e = f->mt[r].e;
- if (nft_set_elem_expired(&e->ext))
- goto cont;
-
- elem.priv = e;
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &e->priv);
if (iter->err < 0)
- goto out;
+ return;
cont:
iter->count++;
}
+}
-out:
- rcu_read_unlock();
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @iter: Iterator
+ *
+ * Test if destructive action is needed or not, clone active backend if needed
+ * and call the real function to work on the data.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ m = pipapo_maybe_clone(set);
+ if (!m) {
+ iter->err = -ENOMEM;
+ return;
+ }
+
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ break;
+ case NFT_ITER_READ:
+ rcu_read_lock();
+ m = rcu_dereference(priv->match);
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ rcu_read_unlock();
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
}
/**
@@ -2039,20 +2227,24 @@ static int nft_pipapo_init(const struct nft_set *set,
struct nft_pipapo_field *f;
int err, i, field_count;
+ BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0);
+
field_count = desc->field_count ? : 1;
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255);
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT);
+
if (field_count > NFT_PIPAPO_MAX_FIELDS)
return -EINVAL;
- m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count,
- GFP_KERNEL);
+ m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL);
if (!m)
return -ENOMEM;
m->field_count = field_count;
m->bsize_max = 0;
- m->scratch = alloc_percpu(unsigned long *);
+ m->scratch = alloc_percpu(struct nft_pipapo_scratch *);
if (!m->scratch) {
err = -ENOMEM;
goto out_scratch;
@@ -2060,20 +2252,14 @@ static int nft_pipapo_init(const struct nft_set *set,
for_each_possible_cpu(i)
*per_cpu_ptr(m->scratch, i) = NULL;
-#ifdef NFT_PIPAPO_ALIGN
- m->scratch_aligned = alloc_percpu(unsigned long *);
- if (!m->scratch_aligned) {
- err = -ENOMEM;
- goto out_free;
- }
- for_each_possible_cpu(i)
- *per_cpu_ptr(m->scratch_aligned, i) = NULL;
-#endif
-
rcu_head_init(&m->rcu);
nft_pipapo_for_each_field(f, i, m) {
- int len = desc->field_len[i] ? : set->klen;
+ unsigned int len = desc->field_len[i] ? : set->klen;
+
+ /* f->groups is u8 */
+ BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES *
+ BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256);
f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
@@ -2082,28 +2268,15 @@ static int nft_pipapo_init(const struct nft_set *set,
f->bsize = 0;
f->rules = 0;
- NFT_PIPAPO_LT_ASSIGN(f, NULL);
+ f->rules_alloc = 0;
+ f->lt = NULL;
f->mt = NULL;
}
- /* Create an initial clone of matching data for next insertion */
- priv->clone = pipapo_clone(m);
- if (IS_ERR(priv->clone)) {
- err = PTR_ERR(priv->clone);
- goto out_free;
- }
-
- priv->dirty = false;
-
rcu_assign_pointer(priv->match, m);
return 0;
-out_free:
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
- free_percpu(m->scratch);
out_scratch:
kfree(m);
@@ -2111,57 +2284,55 @@ out_scratch:
}
/**
- * nft_pipapo_destroy() - Free private data for set and all committed elements
+ * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @ctx: context
* @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
*/
-static void nft_pipapo_destroy(const struct nft_set *set)
+static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ struct nft_pipapo_match *m)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m;
struct nft_pipapo_field *f;
- int i, r, cpu;
+ unsigned int i, r;
- m = rcu_dereference_protected(priv->match, true);
- if (m) {
- rcu_barrier();
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
- for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
- ;
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
- for (r = 0; r < f->rules; r++) {
- struct nft_pipapo_elem *e;
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
- if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
- continue;
+ e = f->mt[r].e;
- e = f->mt[r].e;
+ nf_tables_set_elem_destroy(ctx, set, &e->priv);
+ }
+}
- nft_set_elem_destroy(set, e, true);
- }
+/**
+ * nft_pipapo_destroy() - Free private data for set and all committed elements
+ * @ctx: context
+ * @set: nftables API set representation
+ */
+static void nft_pipapo_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(m->scratch_aligned);
-#endif
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(m->scratch, cpu));
- free_percpu(m->scratch);
- pipapo_free_fields(m);
- kfree(m);
- priv->match = NULL;
- }
+ m = rcu_dereference_protected(priv->match, true);
if (priv->clone) {
-#ifdef NFT_PIPAPO_ALIGN
- free_percpu(priv->clone->scratch_aligned);
-#endif
- for_each_possible_cpu(cpu)
- kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
- free_percpu(priv->clone->scratch);
-
- pipapo_free_fields(priv->clone);
- kfree(priv->clone);
+ nft_set_pipapo_match_destroy(ctx, set, priv->clone);
+ pipapo_free_match(priv->clone);
priv->clone = NULL;
+ } else {
+ nft_set_pipapo_match_destroy(ctx, set, m);
}
+
+ pipapo_free_match(m);
}
/**
@@ -2197,6 +2368,8 @@ const struct nft_set_type nft_set_pipapo_type = {
.init = nft_pipapo_init,
.destroy = nft_pipapo_destroy,
.gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
@@ -2219,6 +2392,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = {
.init = nft_pipapo_init,
.destroy = nft_pipapo_destroy,
.gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 25a75591583e..eaab422aa56a 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -70,15 +70,9 @@
#define NFT_PIPAPO_ALIGN_HEADROOM \
(NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
-#define NFT_PIPAPO_LT_ASSIGN(field, x) \
- do { \
- (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \
- (field)->lt = (x); \
- } while (0)
#else
#define NFT_PIPAPO_ALIGN_HEADROOM 0
#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
-#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
#endif /* NFT_PIPAPO_ALIGN */
#define nft_pipapo_for_each_field(field, index, match) \
@@ -110,44 +104,50 @@ union nft_pipapo_map_bucket {
/**
* struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups: Amount of bit groups
* @rules: Number of inserted rules
* @bsize: Size of each bucket in lookup table, in longs
+ * @rules_alloc: Number of allocated rules, always >= rules
+ * @groups: Amount of bit groups
* @bb: Number of bits grouped together in lookup table buckets
* @lt: Lookup table: 'groups' rows of buckets
- * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
* @mt: Mapping table: one bucket per rule
*/
struct nft_pipapo_field {
- int groups;
- unsigned long rules;
- size_t bsize;
- int bb;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long *lt_aligned;
-#endif
+ unsigned int rules;
+ unsigned int bsize;
+ unsigned int rules_alloc;
+ u8 groups;
+ u8 bb;
unsigned long *lt;
union nft_pipapo_map_bucket *mt;
};
/**
+ * struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
+ * @map_index: Current working bitmap index, toggled between field matches
+ * @__map: store partial matching results during lookup
+ */
+struct nft_pipapo_scratch {
+ local_lock_t bh_lock;
+ u8 map_index;
+ unsigned long __map[];
+};
+
+/**
* struct nft_pipapo_match - Data used for lookup and matching
- * @field_count Amount of fields in set
- * @scratch: Preallocated per-CPU maps for partial matching results
- * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes
+ * @field_count: Amount of fields in set
* @bsize_max: Maximum lookup table bucket size of all fields, in longs
- * @rcu Matching data is swapped on commits
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @rcu: Matching data is swapped on commits
* @f: Fields, with lookup and mapping tables
*/
struct nft_pipapo_match {
- int field_count;
-#ifdef NFT_PIPAPO_ALIGN
- unsigned long * __percpu *scratch_aligned;
-#endif
- unsigned long * __percpu *scratch;
- size_t bsize_max;
+ u8 field_count;
+ unsigned int bsize_max;
+ struct nft_pipapo_scratch * __percpu *scratch;
struct rcu_head rcu;
- struct nft_pipapo_field f[];
+ struct nft_pipapo_field f[] __counted_by(field_count);
};
/**
@@ -155,14 +155,12 @@ struct nft_pipapo_match {
* @match: Currently in-use matching data
* @clone: Copy where pending insertions and deletions are kept
* @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
* @last_gc: Timestamp of last garbage collection run, jiffies
*/
struct nft_pipapo {
struct nft_pipapo_match __rcu *match;
struct nft_pipapo_match *clone;
int width;
- bool dirty;
unsigned long last_gc;
};
@@ -170,14 +168,17 @@ struct nft_pipapo_elem;
/**
* struct nft_pipapo_elem - API-facing representation of single set element
+ * @priv: element placeholder
* @ext: nftables API extensions
*/
struct nft_pipapo_elem {
- struct nft_set_ext ext;
+ struct nft_elem_priv priv;
+ struct nft_set_ext ext;
};
-int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
- union nft_pipapo_map_bucket *mt, bool match_only);
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only);
/**
* pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
@@ -185,7 +186,7 @@ int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
-static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f,
unsigned long *dst,
const u8 *data)
{
@@ -213,7 +214,7 @@ static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
-static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f,
unsigned long *dst,
const u8 *data)
{
@@ -277,4 +278,25 @@ static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
return size;
}
+/**
+ * pipapo_resmap_init() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Initialize all bits covered by the first field to one, so that after
+ * the first step, only the matching bits of the first bit group remain.
+ *
+ * If other fields have a large bitmap, set remainder of res_map to 0.
+ */
+static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+ const struct nft_pipapo_field *f = m->f;
+ int i;
+
+ for (i = 0; i < f->bsize; i++)
+ res_map[i] = ULONG_MAX;
+
+ for (i = f->bsize; i < m->bsize_max; i++)
+ res_map[i] = 0ul;
+}
#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index d65ae0e23028..7ff90325c97f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -57,7 +57,7 @@
/* Jump to label if @reg is zero */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
- asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
"je %l[" #label "]" : : : : label)
/* Store 256 bits from YMM register into memory. Contrary to bucket load
@@ -71,9 +71,6 @@
#define NFT_PIPAPO_AVX2_ZERO(reg) \
asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
-/* Current working bitmap index, toggled between field matches */
-static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
-
/**
* nft_pipapo_avx2_prepare() - Prepare before main algorithm body
*
@@ -142,7 +139,6 @@ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
* @map: Bitmap to be scanned for set bits
* @dst: Destination bitmap
* @mt: Mapping table containing bit set specifiers
- * @len: Length of bitmap in longs
* @last: Return index of first set bit, if this is the last field
*
* This is an alternative implementation of pipapo_refill() suitable for usage
@@ -216,8 +212,9 @@ static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
@@ -278,8 +275,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
@@ -354,8 +352,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -449,8 +448,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -538,8 +538,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
@@ -673,8 +674,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -730,8 +732,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -794,8 +797,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -869,8 +873,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -887,7 +892,7 @@ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
- NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 6, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
NFT_PIPAPO_AVX2_AND(7, 2, 3);
/* Stall */
@@ -954,8 +959,9 @@ nothing:
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
@@ -988,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(3, 4, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
- NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 3, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
@@ -1030,6 +1037,7 @@ nothing:
/**
* nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @mdata: Matching data, including mapping table
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
@@ -1045,17 +1053,17 @@ nothing:
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
-static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
- struct nft_pipapo_field *f, int offset,
- const u8 *pkt, bool first, bool last)
+static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata,
+ unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
{
- unsigned long *lt = f->lt, bsize = f->bsize;
+ unsigned long bsize = f->bsize;
int i, ret = -1, b;
- lt += offset * NFT_PIPAPO_LONGS_PER_M256;
-
if (first)
- memset(map, 0xff, bsize * sizeof(*map));
+ pipapo_resmap_init(mdata, map);
for (i = offset; i < bsize; i++) {
if (f->bb == 8)
@@ -1091,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
return false;
est->size = pipapo_estimate_size(desc);
@@ -1106,57 +1114,78 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
}
/**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- * @ext: nftables API extension pointer, filled with matching reference
+ * pipapo_resmap_init_avx2() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Like pipapo_resmap_init() but do not set start map bits covered by the first field.
+ */
+static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+ const struct nft_pipapo_field *f = m->f;
+ int i;
+
+ /* Starting map doesn't need to be set to all-ones for this implementation,
+ * but we do need to zero the remaining bits, if any.
+ */
+ for (i = f->bsize; i < m->bsize_max; i++)
+ res_map[i] = 0ul;
+}
+
+/**
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
*
* This implementation exploits the repetitive characteristic of the algorithm
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
*
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- unsigned long *res, *fill, *scratch;
- u8 genmask = nft_genmask_cur(net);
- const u8 *rp = (const u8 *)key;
- struct nft_pipapo_match *m;
- struct nft_pipapo_field *f;
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
+ unsigned long *res, *fill, *map;
bool map_index;
- int i, ret = 0;
+ int i;
- m = rcu_dereference(priv->match);
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ return NULL;
- /* This also protects access to all data related to scratch maps */
- kernel_fpu_begin();
+ __local_lock_nested_bh(&scratch->bh_lock);
+ map_index = scratch->map_index;
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res = map + (map_index ? m->bsize_max : 0);
+ fill = map + (map_index ? 0 : m->bsize_max);
- scratch = *raw_cpu_ptr(m->scratch_aligned);
- if (unlikely(!scratch)) {
- kernel_fpu_end();
- return false;
- }
- map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
-
- res = scratch + (map_index ? m->bsize_max : 0);
- fill = scratch + (map_index ? 0 : m->bsize_max);
+ pipapo_resmap_init_avx2(m, res);
- /* Starting map doesn't need to be set for this implementation */
+ /* Note that we don't need a valid MXCSR state for any of the
+ * operations we use here, so pass 0 as mask and spare a LDMXCSR
+ * instruction.
+ */
+ kernel_fpu_begin_mask(0);
nft_pipapo_avx2_prepare();
-next_match:
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1, first = !i;
+ int ret = 0;
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
- ret, rp, \
+ ret, data, \
first, last))
if (likely(f->bb == 8)) {
@@ -1171,8 +1200,8 @@ next_match:
} else if (f->groups == 16) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
} else {
- ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
- ret, rp,
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
first, last);
}
} else {
@@ -1187,8 +1216,8 @@ next_match:
} else if (f->groups == 32) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
} else {
- ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
- ret, rp,
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
first, last);
}
}
@@ -1196,28 +1225,78 @@ next_match:
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
- if (ret < 0)
- goto out;
+next_match:
+ if (ret < 0) {
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+ }
if (last) {
- *ext = &f->mt[ret].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask))) {
- ret = 0;
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[ret].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask))) {
+ ret = pipapo_refill(res, f->bsize, f->rules,
+ fill, f->mt, last);
goto next_match;
}
- goto out;
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return e;
}
+ map_index = !map_index;
swap(res, fill);
- rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
-out:
- if (i % 2)
- raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const u8 *rp = (const u8 *)key;
+ const struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+ if (unlikely(!irq_fpu_usable())) {
+ const struct nft_set_ext *ext;
+
+ ext = nft_pipapo_lookup(net, set, key);
+
+ local_bh_enable();
+ return ext;
+ }
+
+ m = rcu_dereference(priv->match);
+
+ e = pipapo_get_avx2(m, rp, 0, get_jiffies_64());
+ local_bh_enable();
- return ret >= 0;
+ return e ? &e->ext : NULL;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index 394bcb704db7..c2999b63da3f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -5,10 +5,12 @@
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+struct nft_pipapo_match;
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp);
#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 62f416bc0579..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -18,11 +18,12 @@
struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
- seqcount_t count;
- struct delayed_work gc_work;
+ seqcount_rwlock_t count;
+ unsigned long last_gc;
};
struct nft_rbtree_elem {
+ struct nft_elem_priv priv;
struct rb_node node;
struct nft_set_ext ext;
};
@@ -38,40 +39,47 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
return !nft_rbtree_interval_end(rbe);
}
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
- const struct nft_rbtree_elem *interval)
+static int nft_rbtree_cmp(const struct nft_set *set,
+ const struct nft_rbtree_elem *e1,
+ const struct nft_rbtree_elem *e2)
{
- return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+ return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+ set->klen);
}
-static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext,
- unsigned int seq)
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_elem_expired(&rbe->ext);
+}
+
+static const struct nft_set_ext *
+__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, unsigned int seq)
{
struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
- const void *this;
int d;
parent = rcu_dereference_raw(priv->root.rb_node);
while (parent != NULL) {
if (read_seqcount_retry(&priv->count, seq))
- return false;
+ return NULL;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- this = nft_set_ext_key(&rbe->ext);
- d = memcmp(this, key, set->klen);
+ d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
if (d < 0) {
parent = rcu_dereference_raw(parent->rb_left);
if (interval &&
- nft_rbtree_equal(set, this, interval) &&
+ !nft_rbtree_cmp(set, rbe, interval) &&
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
- interval = rbe;
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_rbtree_elem_expired(rbe))
+ interval = rbe;
} else if (d > 0)
parent = rcu_dereference_raw(parent->rb_right);
else {
@@ -80,50 +88,47 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
continue;
}
- if (nft_set_elem_expired(&rbe->ext))
- return false;
+ if (nft_rbtree_elem_expired(rbe))
+ return NULL;
if (nft_rbtree_interval_end(rbe)) {
if (nft_set_is_anonymous(set))
- return false;
+ return NULL;
parent = rcu_dereference_raw(parent->rb_left);
interval = NULL;
continue;
}
- *ext = &rbe->ext;
- return true;
+ return &rbe->ext;
}
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_set_elem_expired(&interval->ext) &&
- nft_rbtree_interval_start(interval)) {
- *ext = &interval->ext;
- return true;
- }
+ nft_rbtree_interval_start(interval))
+ return &interval->ext;
- return false;
+ return NULL;
}
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
- bool ret;
+ const struct nft_set_ext *ext;
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
- if (ret || !read_seqcount_retry(&priv->count, seq))
- return ret;
+ ext = __nft_rbtree_lookup(net, set, key, seq);
+ if (ext || !read_seqcount_retry(&priv->count, seq))
+ return ext;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
+ ext = __nft_rbtree_lookup(net, set, key, seq);
read_unlock_bh(&priv->lock);
- return ret;
+ return ext;
}
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
@@ -190,8 +195,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
return false;
}
-static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rbtree_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
@@ -202,118 +208,274 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
if (ret || !read_seqcount_retry(&priv->count, seq))
- return rbe;
+ return &rbe->priv;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
- if (!ret)
- rbe = ERR_PTR(-ENOENT);
read_unlock_bh(&priv->lock);
- return rbe;
+ if (!ret)
+ return ERR_PTR(-ENOENT);
+
+ return &rbe->priv;
+}
+
+static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ lockdep_assert_held_write(&priv->lock);
+ nft_setelem_data_deactivate(net, set, &rbe->priv);
+ rb_erase(&rbe->node, &priv->root);
+}
+
+static const struct nft_rbtree_elem *
+nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ struct nft_set *set = (struct nft_set *)__set;
+ struct rb_node *prev = rb_prev(&rbe->node);
+ struct net *net = read_pnet(&set->net);
+ struct nft_rbtree_elem *rbe_prev;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+ if (!gc)
+ return ERR_PTR(-ENOMEM);
+
+ /* search for end interval coming before this element.
+ * end intervals don't carry a timeout extension, they
+ * are coupled with the interval start element.
+ */
+ while (prev) {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ if (nft_rbtree_interval_end(rbe_prev) &&
+ nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
+ break;
+
+ prev = rb_prev(prev);
+ }
+
+ rbe_prev = NULL;
+ if (prev) {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
+
+ /* There is always room in this trans gc for this element,
+ * memory allocation never actually happens, hence, the warning
+ * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
+ * this is synchronous gc which never fails.
+ */
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
+
+ nft_trans_gc_elem_add(gc, rbe_prev);
+ }
+
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
+
+ nft_trans_gc_elem_add(gc, rbe);
+
+ nft_trans_gc_queue_sync_done(gc);
+
+ return rbe_prev;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+ struct nft_rbtree_elem *rbe,
+ struct rb_node *first)
+{
+ struct nft_rbtree_elem *first_elem;
+
+ first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+ /* this element is closest to where the new element is to be inserted:
+ * update the first element for the node list path.
+ */
+ if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+ return true;
+
+ return false;
}
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+ struct rb_node *node, *next, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
+ u8 cur_genmask = nft_genmask_cur(net);
u8 genmask = nft_genmask_next(net);
- struct nft_rbtree_elem *rbe;
- struct rb_node *parent, **p;
- bool overlap = false;
+ u64 tstamp = nft_net_tstamp(net);
int d;
- /* Detect overlaps as we descend the tree. Set the flag in these cases:
- *
- * a1. _ _ __>| ?_ _ __| (insert end before existing end)
- * a2. _ _ ___| ?_ _ _>| (insert end after existing end)
- * a3. _ _ ___? >|_ _ __| (insert start before existing end)
- *
- * and clear it later on, as we eventually reach the points indicated by
- * '?' above, in the cases described below. We'll always meet these
- * later, locally, due to tree ordering, and overlaps for the intervals
- * that are the closest together are always evaluated last.
- *
- * b1. _ _ __>| !_ _ __| (insert end before existing start)
- * b2. _ _ ___| !_ _ _>| (insert end after existing start)
- * b3. _ _ ___! >|_ _ __| (insert start after existing end)
- *
- * Case a3. resolves to b3.:
- * - if the inserted start element is the leftmost, because the '0'
- * element in the tree serves as end element
- * - otherwise, if an existing end is found. Note that end elements are
- * always inserted after corresponding start elements.
- *
- * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
- * in that order.
- *
- * The flag is also cleared in two special cases:
- *
- * b4. |__ _ _!|<_ _ _ (insert start right before existing end)
- * b5. |__ _ >|!__ _ _ (insert end right after existing start)
- *
- * which always happen as last step and imply that no further
- * overlapping is possible.
+ /* Descend the tree to search for an existing element greater than the
+ * key value to insert that is greater than the new element. This is the
+ * first element to walk the ordered elements to find possible overlap.
*/
-
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
parent = *p;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext),
- nft_set_ext_key(&new->ext),
- set->klen);
+ d = nft_rbtree_cmp(set, rbe, new);
+
if (d < 0) {
p = &parent->rb_left;
-
- if (nft_rbtree_interval_start(new)) {
- if (nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext, genmask))
- overlap = false;
- } else {
- overlap = nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext,
- genmask);
- }
} else if (d > 0) {
- p = &parent->rb_right;
+ if (!first ||
+ nft_rbtree_update_first(set, rbe, first))
+ first = &rbe->node;
- if (nft_rbtree_interval_end(new)) {
- overlap = nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext,
- genmask);
- } else if (nft_rbtree_interval_end(rbe) &&
- nft_set_elem_active(&rbe->ext, genmask)) {
- overlap = true;
- }
+ p = &parent->rb_right;
} else {
- if (nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_start(new)) {
+ if (nft_rbtree_interval_end(rbe))
p = &parent->rb_left;
-
- if (nft_set_elem_active(&rbe->ext, genmask))
- overlap = false;
- } else if (nft_rbtree_interval_start(rbe) &&
- nft_rbtree_interval_end(new)) {
+ else
p = &parent->rb_right;
+ }
+ }
- if (nft_set_elem_active(&rbe->ext, genmask))
- overlap = false;
- } else if (nft_set_elem_active(&rbe->ext, genmask)) {
- *ext = &rbe->ext;
- return -EEXIST;
- } else {
- p = &parent->rb_left;
+ if (!first)
+ first = rb_first(&priv->root);
+
+ /* Detect overlap by going through the list of valid tree nodes.
+ * Values stored in the tree are in reversed order, starting from
+ * highest to lowest value.
+ */
+ for (node = first; node != NULL; node = next) {
+ next = rb_next(node);
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ if (!nft_set_elem_active(&rbe->ext, genmask))
+ continue;
+
+ /* perform garbage collection to avoid bogus overlap reports
+ * but skip new elements in this transaction.
+ */
+ if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
+ nft_set_elem_active(&rbe->ext, cur_genmask)) {
+ const struct nft_rbtree_elem *removed_end;
+
+ removed_end = nft_rbtree_gc_elem(set, priv, rbe);
+ if (IS_ERR(removed_end))
+ return PTR_ERR(removed_end);
+
+ if (removed_end == rbe_le || removed_end == rbe_ge)
+ return -EAGAIN;
+
+ continue;
+ }
+
+ d = nft_rbtree_cmp(set, rbe, new);
+ if (d == 0) {
+ /* Matching end element: no need to look for an
+ * overlapping greater or equal element.
+ */
+ if (nft_rbtree_interval_end(rbe)) {
+ rbe_le = rbe;
+ break;
+ }
+
+ /* first element that is greater or equal to key value. */
+ if (!rbe_ge) {
+ rbe_ge = rbe;
+ continue;
}
+
+ /* this is a closer more or equal element, update it. */
+ if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+ rbe_ge = rbe;
+ continue;
+ }
+
+ /* element is equal to key value, make sure flags are
+ * the same, an existing more or equal start element
+ * must not be replaced by more or equal end element.
+ */
+ if ((nft_rbtree_interval_start(new) &&
+ nft_rbtree_interval_start(rbe_ge)) ||
+ (nft_rbtree_interval_end(new) &&
+ nft_rbtree_interval_end(rbe_ge))) {
+ rbe_ge = rbe;
+ continue;
+ }
+ } else if (d > 0) {
+ /* annotate element greater than the new element. */
+ rbe_ge = rbe;
+ continue;
+ } else if (d < 0) {
+ /* annotate element less than the new element. */
+ rbe_le = rbe;
+ break;
}
}
- if (overlap)
+ /* - new start element matching existing start element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+ nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+ *elem_priv = &rbe_ge->priv;
+ return -EEXIST;
+ }
+
+ /* - new end element matching existing end element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+ nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+ *elem_priv = &rbe_le->priv;
+ return -EEXIST;
+ }
+
+ /* - new start element with existing closest, less or equal key value
+ * being a start element: partial overlap, reported as -ENOTEMPTY.
+ * Anonymous sets allow for two consecutive start element since they
+ * are constant, skip them to avoid bogus overlap reports.
+ */
+ if (!nft_set_is_anonymous(set) && rbe_le &&
+ nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+ return -ENOTEMPTY;
+
+ /* - new end element with existing closest, less or equal key value
+ * being a end element: partial overlap, reported as -ENOTEMPTY.
+ */
+ if (rbe_le &&
+ nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
+ return -ENOTEMPTY;
+
+ /* - new end element with existing closest, greater or equal key value
+ * being an end element: partial overlap, reported as -ENOTEMPTY
+ */
+ if (rbe_ge &&
+ nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
return -ENOTEMPTY;
+ /* Accepted element: pick insertion point depending on key value */
+ parent = NULL;
+ p = &priv->root.rb_node;
+ while (*p != NULL) {
+ parent = *p;
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ d = nft_rbtree_cmp(set, rbe, new);
+
+ if (d < 0)
+ p = &parent->rb_left;
+ else if (d > 0)
+ p = &parent->rb_right;
+ else if (nft_rbtree_interval_end(rbe))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -321,66 +483,74 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
int err;
+ do {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ cond_resched();
+
+ write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
+ err = __nft_rbtree_insert(net, set, rbe, elem_priv);
+ write_seqcount_end(&priv->count);
+ write_unlock_bh(&priv->lock);
+ } while (err == -EAGAIN);
+
+ return err;
+}
+
+static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
+{
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
- err = __nft_rbtree_insert(net, set, rbe, ext);
+ rb_erase(&rbe->node, &priv->root);
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
-
- return err;
}
static void nft_rbtree_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- rb_erase(&rbe->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
+ nft_rbtree_erase(priv, rbe);
}
static void nft_rbtree_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = elem->priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &rbe->ext);
- nft_set_elem_clear_busy(&rbe->ext);
+ nft_clear(net, &rbe->ext);
}
-static bool nft_rbtree_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rbtree_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&rbe->ext) ||
- !nft_is_active(net, &rbe->ext)) {
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &rbe->ext);
}
-static void *nft_rbtree_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_next(net);
+ u64 tstamp = nft_net_tstamp(net);
int d;
while (parent != NULL) {
@@ -401,106 +571,132 @@ static void *nft_rbtree_deactivate(const struct net *net,
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
+ } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
+ break;
} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = parent->rb_left;
continue;
}
- nft_rbtree_flush(net, set, rbe);
- return rbe;
+ nft_rbtree_flush(net, set, &rbe->priv);
+ return &rbe->priv;
}
}
return NULL;
}
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
- struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
- struct nft_set_elem elem;
struct rb_node *node;
- read_lock_bh(&priv->lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&rbe->ext))
- goto cont;
- if (!nft_set_elem_active(&rbe->ext, iter->genmask))
- goto cont;
- elem.priv = rbe;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
- if (iter->err < 0) {
- read_unlock_bh(&priv->lock);
+ iter->err = iter->fn(ctx, set, iter, &rbe->priv);
+ if (iter->err < 0)
return;
- }
cont:
iter->count++;
}
- read_unlock_bh(&priv->lock);
}
-static void nft_rbtree_gc(struct work_struct *work)
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
- struct nft_set_gc_batch *gcb = NULL;
- struct nft_rbtree *priv;
- struct rb_node *node;
- struct nft_set *set;
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+ nft_rbtree_do_walk(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ read_lock_bh(&priv->lock);
+ nft_rbtree_do_walk(ctx, set, iter);
+ read_unlock_bh(&priv->lock);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ nft_setelem_data_deactivate(net, set, &rbe->priv);
+ nft_rbtree_erase(priv, rbe);
+}
+
+static void nft_rbtree_gc(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+ struct net *net = read_pnet(&set->net);
+ u64 tstamp = nft_net_tstamp(net);
+ struct rb_node *node, *next;
+ struct nft_trans_gc *gc;
- priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ for (node = rb_first(&priv->root); node ; node = next) {
+ next = rb_next(node);
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ /* elements are reversed in the rbtree for historical reasons,
+ * from highest to lowest value, that is why end element is
+ * always visited before the start element.
+ */
if (nft_rbtree_interval_end(rbe)) {
rbe_end = rbe;
continue;
}
- if (!nft_set_elem_expired(&rbe->ext))
- continue;
- if (nft_set_elem_mark_busy(&rbe->ext))
+ if (!__nft_set_elem_expired(&rbe->ext, tstamp))
continue;
- if (rbe_prev) {
- rb_erase(&rbe_prev->node, &priv->root);
- rbe_prev = NULL;
- }
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (!gcb)
- break;
-
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- rbe_prev = rbe;
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+ /* end element needs to be removed first, it has
+ * no timeout extension.
+ */
if (rbe_end) {
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_end);
- rb_erase(&rbe_end->node, &priv->root);
+ nft_rbtree_gc_remove(net, set, priv, rbe_end);
+ nft_trans_gc_elem_add(gc, rbe_end);
rbe_end = NULL;
}
- node = rb_next(node);
- if (!node)
- break;
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+
+ nft_rbtree_gc_remove(net, set, priv, rbe);
+ nft_trans_gc_elem_add(gc, rbe);
}
- if (rbe_prev)
- rb_erase(&rbe_prev->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
- nft_set_gc_batch_complete(gcb);
+try_later:
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+ if (gc) {
+ gc = nft_trans_gc_catchall_sync(gc);
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -515,30 +711,26 @@ static int nft_rbtree_init(const struct nft_set *set,
{
struct nft_rbtree *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);
+
rwlock_init(&priv->lock);
- seqcount_init(&priv->count);
+ seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
- INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
- if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
-
return 0;
}
-static void nft_rbtree_destroy(const struct nft_set *set)
+static void nft_rbtree_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct rb_node *node;
- cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- nft_set_elem_destroy(set, rbe, true);
+ nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
}
}
@@ -560,6 +752,61 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
+static void nft_rbtree_commit(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ nft_rbtree_gc(set);
+}
+
+static void nft_rbtree_gc_init(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ priv->last_gc = jiffies;
+}
+
+/* rbtree stores ranges as singleton elements, each range is composed of two
+ * elements ...
+ */
+static u32 nft_rbtree_ksize(u32 size)
+{
+ return size * 2;
+}
+
+/* ... hide this detail to userspace. */
+static u32 nft_rbtree_usize(u32 size)
+{
+ if (!size)
+ return 0;
+
+ return size / 2;
+}
+
+static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *node;
+ const void *key;
+
+ node = rb_last(&priv->root);
+ if (!node)
+ return 0;
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (!nft_rbtree_interval_end(rbe))
+ return 0;
+
+ key = nft_set_ext_key(&rbe->ext);
+ if (memchr(key, 1, set->klen))
+ return 0;
+
+ /* this is the all-zero no-match element. */
+ return 1;
+}
+
const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
@@ -573,8 +820,13 @@ const struct nft_set_type nft_set_rbtree_type = {
.deactivate = nft_rbtree_deactivate,
.flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
+ .commit = nft_rbtree_commit,
+ .gc_init = nft_rbtree_gc_init,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
.get = nft_rbtree_get,
+ .ksize = nft_rbtree_ksize,
+ .usize = nft_rbtree_usize,
+ .adjust_maxsize = nft_rbtree_adjust_maxsize,
},
};
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 637ce3e8c575..36affbb697c2 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,11 +9,101 @@
struct nft_socket {
enum nft_socket_keys key:8;
+ u8 level; /* cgroupv2 level to extract */
+ u8 level_user; /* cgroupv2 level provided by userspace */
+ u8 len;
union {
- enum nft_registers dreg:8;
+ u8 dreg;
};
};
+static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
+ struct nft_regs *regs, struct sock *sk,
+ u32 *dest)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+ break;
+#endif
+ default:
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+}
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static noinline bool
+nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
+{
+ struct cgroup *cgrp;
+ u64 cgid;
+
+ if (!sk_fullsock(sk))
+ return false;
+
+ cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+ if (!cgrp)
+ return false;
+
+ cgid = cgroup_id(cgrp);
+ memcpy(dest, &cgid, sizeof(u64));
+ return true;
+}
+
+/* process context only, uses current->nsproxy. */
+static noinline int nft_socket_cgroup_subtree_level(void)
+{
+ struct cgroup *cgrp = cgroup_get_from_path("/");
+ int level;
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ level = cgrp->level;
+
+ cgroup_put(cgrp);
+
+ if (level > 255)
+ return -ERANGE;
+
+ if (WARN_ON_ONCE(level < 0))
+ return -EINVAL;
+
+ return level;
+}
+#endif
+
+static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
+{
+ const struct net_device *indev = nft_in(pkt);
+ const struct sk_buff *skb = pkt->skb;
+ struct sock *sk = NULL;
+
+ if (!indev)
+ return NULL;
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev);
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return sk;
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -27,20 +117,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
sk = NULL;
if (!sk)
- switch(nft_pf(pkt)) {
- case NFPROTO_IPV4:
- sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
- break;
-#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
- case NFPROTO_IPV6:
- sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
- break;
-#endif
- default:
- WARN_ON_ONCE(1);
- regs->verdict.code = NFT_BREAK;
- return;
- }
+ sk = nft_socket_do_lookup(pkt);
if (!sk) {
regs->verdict.code = NFT_BREAK;
@@ -53,24 +130,41 @@ static void nft_socket_eval(const struct nft_expr *expr,
break;
case NFT_SOCKET_MARK:
if (sk_fullsock(sk)) {
- *dest = sk->sk_mark;
+ *dest = READ_ONCE(sk->sk_mark);
} else {
regs->verdict.code = NFT_BREAK;
- return;
+ goto out_put_sk;
+ }
+ break;
+ case NFT_SOCKET_WILDCARD:
+ if (!sk_fullsock(sk)) {
+ regs->verdict.code = NFT_BREAK;
+ goto out_put_sk;
}
+ nft_socket_wildcard(pkt, regs, sk, dest);
break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2:
+ if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
+ regs->verdict.code = NFT_BREAK;
+ goto out_put_sk;
+ }
+ break;
+#endif
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
}
+out_put_sk:
if (sk != skb->sk)
sock_gen_put(sk);
}
static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
- [NFTA_SOCKET_KEY] = { .type = NLA_U32 },
+ [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_SOCKET_DREG] = { .type = NLA_U32 },
+ [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_socket_init(const struct nft_ctx *ctx,
@@ -94,35 +188,106 @@ static int nft_socket_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
+ case NFT_SOCKET_WILDCARD:
len = sizeof(u8);
break;
case NFT_SOCKET_MARK:
len = sizeof(u32);
break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2: {
+ unsigned int level;
+ int err;
+
+ if (!tb[NFTA_SOCKET_LEVEL])
+ return -EINVAL;
+
+ level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL]));
+ if (level > 255)
+ return -EOPNOTSUPP;
+
+ err = nft_socket_cgroup_subtree_level();
+ if (err < 0)
+ return err;
+
+ priv->level_user = level;
+
+ level += err;
+ /* Implies a giant cgroup tree */
+ if (level > 255)
+ return -EOPNOTSUPP;
+
+ priv->level = level;
+ len = sizeof(u64);
+ break;
+ }
+#endif
default:
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_socket_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_socket *priv = nft_expr_priv(expr);
- if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+ if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
return -1;
if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
return -1;
+ if (priv->key == NFT_SOCKET_CGROUPV2 &&
+ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user)))
+ return -1;
return 0;
}
+static bool nft_socket_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+ const struct nft_socket *socket;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ socket = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != socket->key ||
+ priv->dreg != socket->dreg ||
+ priv->level != socket->level) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+
static struct nft_expr_type nft_socket_type;
static const struct nft_expr_ops nft_socket_ops = {
.type = &nft_socket_type,
@@ -130,6 +295,8 @@ static const struct nft_expr_ops nft_socket_ops = {
.eval = nft_socket_eval,
.init = nft_socket_init,
.dump = nft_socket_dump,
+ .validate = nft_socket_validate,
+ .reduce = nft_socket_reduce,
};
static struct nft_expr_type nft_socket_type __read_mostly = {
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index e2c1fc608841..5d3e51825985 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -109,7 +109,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
{
struct synproxy_options opts = {};
struct sk_buff *skb = pkt->skb;
- int thoff = pkt->xt.thoff;
+ int thoff = nft_thoff(pkt);
const struct tcphdr *tcp;
struct tcphdr _tcph;
@@ -123,7 +123,7 @@ static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
return;
}
- tcp = skb_header_pointer(skb, pkt->xt.thoff,
+ tcp = skb_header_pointer(skb, thoff,
sizeof(struct tcphdr),
&_tcph);
if (!tcp) {
@@ -186,13 +186,14 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
break;
#endif
case NFPROTO_INET:
- case NFPROTO_BRIDGE:
err = nf_synproxy_ipv4_init(snet, ctx->net);
if (err)
goto nf_ct_failure;
err = nf_synproxy_ipv6_init(snet, ctx->net);
- if (err)
+ if (err) {
+ nf_synproxy_ipv4_fini(snet, ctx->net);
goto nf_ct_failure;
+ }
break;
}
@@ -217,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
break;
#endif
case NFPROTO_INET:
- case NFPROTO_BRIDGE:
nf_synproxy_ipv4_fini(snet, ctx->net);
nf_synproxy_ipv6_fini(snet, ctx->net);
break;
@@ -248,9 +248,13 @@ static void nft_synproxy_eval(const struct nft_expr *expr,
}
static int nft_synproxy_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD));
}
@@ -270,7 +274,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx,
nft_synproxy_do_destroy(ctx);
}
-static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_synproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_synproxy *priv = nft_expr_priv(expr);
@@ -286,6 +291,7 @@ static const struct nft_expr_ops nft_synproxy_ops = {
.dump = nft_synproxy_dump,
.type = &nft_synproxy_type,
.validate = nft_synproxy_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_synproxy_type __read_mostly = {
@@ -388,3 +394,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("synproxy");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY);
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index d67f83a0958d..50481280abd2 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -13,9 +13,9 @@
#endif
struct nft_tproxy {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_port:8;
- u8 family;
+ u8 sreg_addr;
+ u8 sreg_port;
+ u8 family;
};
static void nft_tproxy_eval_v4(const struct nft_expr *expr,
@@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
if (!hp) {
regs->verdict.code = NFT_BREAK;
@@ -46,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
if (priv->sreg_addr)
- taddr = regs->data[priv->sreg_addr];
+ taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]);
taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -82,16 +88,17 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
const struct nft_tproxy *priv = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct in6_addr taddr;
- int thoff = pkt->xt.thoff;
+ int thoff = nft_thoff(pkt);
struct udphdr _hdr, *hp;
+ struct in6_addr taddr;
__be16 tport = 0;
struct sock *sk;
int l4proto;
memset(&taddr, 0, sizeof(taddr));
- if (!pkt->tprot_set) {
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -117,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
if (priv->sreg_port)
- tport = nft_reg_load16(&regs->data[priv->sreg_port]);
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -176,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
- [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
+ [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
[NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
};
@@ -247,15 +254,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
}
if (tb[NFTA_TPROXY_REG_ADDR]) {
- priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR],
+ &priv->sreg_addr, alen);
if (err < 0)
return err;
}
if (tb[NFTA_TPROXY_REG_PORT]) {
- priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
- err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT],
+ &priv->sreg_port, sizeof(u16));
if (err < 0)
return err;
}
@@ -263,8 +270,31 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
return 0;
}
+static void nft_tproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ nf_defrag_ipv4_disable(ctx->net);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nf_defrag_ipv6_disable(ctx->net);
+ break;
+#endif
+ case NFPROTO_UNSPEC:
+ nf_defrag_ipv4_disable(ctx->net);
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ nf_defrag_ipv6_disable(ctx->net);
+#endif
+ break;
+ }
+}
+
static int nft_tproxy_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tproxy *priv = nft_expr_priv(expr);
@@ -282,13 +312,27 @@ static int nft_tproxy_dump(struct sk_buff *skb,
return 0;
}
+static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+}
+
static struct nft_expr_type nft_tproxy_type;
static const struct nft_expr_ops nft_tproxy_ops = {
.type = &nft_tproxy_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
.eval = nft_tproxy_eval,
.init = nft_tproxy_init,
+ .destroy = nft_tproxy_destroy,
.dump = nft_tproxy_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .validate = nft_tproxy_validate,
};
static struct nft_expr_type nft_tproxy_type __read_mostly = {
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 30be5787fbde..a12486ae089d 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -15,8 +15,9 @@
struct nft_tunnel {
enum nft_tunnel_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
enum nft_tunnel_mode mode:8;
+ u8 len;
};
static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -65,9 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
- [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
- [NFTA_TUNNEL_MODE] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_tunnel_get_init(const struct nft_ctx *ctx,
@@ -93,8 +94,6 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
-
if (tb[NFTA_TUNNEL_MODE]) {
priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE]));
if (priv->mode > NFT_TUNNEL_MODE_MAX)
@@ -103,12 +102,13 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
priv->mode = NFT_TUNNEL_MODE_NONE;
}
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_tunnel_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tunnel *priv = nft_expr_priv(expr);
@@ -124,6 +124,31 @@ nla_put_failure:
return -1;
}
+static bool nft_tunnel_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ const struct nft_tunnel *tunnel;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ tunnel = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != tunnel->key ||
+ priv->dreg != tunnel->dreg ||
+ priv->mode != tunnel->mode) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+
static struct nft_expr_type nft_tunnel_type;
static const struct nft_expr_ops nft_tunnel_get_ops = {
.type = &nft_tunnel_type,
@@ -131,10 +156,12 @@ static const struct nft_expr_ops nft_tunnel_get_ops = {
.eval = nft_tunnel_get_eval,
.init = nft_tunnel_get_init,
.dump = nft_tunnel_get_dump,
+ .reduce = nft_tunnel_get_reduce,
};
static struct nft_expr_type nft_tunnel_type __read_mostly = {
.name = "tunnel",
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_get_ops,
.policy = nft_tunnel_policy,
.maxattr = NFTA_TUNNEL_MAX,
@@ -147,8 +174,8 @@ struct nft_tunnel_opts {
struct erspan_metadata erspan;
u8 data[IP_TUNNEL_OPTS_MAX];
} u;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
u32 len;
- __be16 flags;
};
struct nft_tunnel_obj {
@@ -244,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
opts->len = sizeof(struct vxlan_metadata);
- opts->flags = TUNNEL_VXLAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags);
return 0;
}
@@ -298,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
opts->u.erspan.version = version;
opts->len = sizeof(struct erspan_metadata);
- opts->flags = TUNNEL_ERSPAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags);
return 0;
}
@@ -306,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
[NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
[NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
- [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
};
static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
struct nft_tunnel_opts *opts)
{
- struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len;
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
int err, data_len;
@@ -339,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
opt->length = data_len / 4;
opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
- opts->flags = TUNNEL_GENEVE_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags);
return 0;
}
@@ -357,8 +387,9 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- int err, rem, type = 0;
struct nlattr *nla;
+ int err, rem;
+ u32 type = 0;
err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
nft_tunnel_opts_policy, NULL);
@@ -373,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
err = nft_tunnel_obj_vxlan_init(nla, opts);
if (err)
return err;
- type = TUNNEL_VXLAN_OPT;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
break;
case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
if (type)
@@ -381,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
err = nft_tunnel_obj_erspan_init(nla, opts);
if (err)
return err;
- type = TUNNEL_ERSPAN_OPT;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
break;
case NFTA_TUNNEL_KEY_OPTS_GENEVE:
- if (type && type != TUNNEL_GENEVE_OPT)
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
return -EINVAL;
err = nft_tunnel_obj_geneve_init(nla, opts);
if (err)
return err;
- type = TUNNEL_GENEVE_OPT;
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
break;
default:
return -EOPNOTSUPP;
@@ -426,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
memset(&info, 0, sizeof(info));
info.mode = IP_TUNNEL_INFO_TX;
info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
- info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags);
if (tb[NFTA_TUNNEL_KEY_IP]) {
err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
@@ -455,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
- info.key.tun_flags &= ~TUNNEL_CSUM;
+ __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
- info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+ info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
- info.key.tun_flags |= TUNNEL_SEQ;
+ __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags);
}
if (tb[NFTA_TUNNEL_KEY_TOS])
info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
- if (tb[NFTA_TUNNEL_KEY_TTL])
- info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
- else
- info.key.ttl = U8_MAX;
+ info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX);
if (tb[NFTA_TUNNEL_KEY_OPTS]) {
err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
@@ -475,13 +506,14 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return err;
}
- md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL,
+ GFP_KERNEL_ACCOUNT);
if (!md)
return -ENOMEM;
memcpy(&md->u.tun_info, &info, sizeof(info));
#ifdef CONFIG_DST_CACHE
- err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL);
+ err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT);
if (err < 0) {
metadata_dst_free(md);
return err;
@@ -555,7 +587,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
if (!nest)
return -1;
- if (opts->flags & TUNNEL_VXLAN_OPT) {
+ if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) {
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
if (!inner)
goto failure;
@@ -563,7 +595,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
htonl(opts->u.vxlan.gbp)))
goto inner_failure;
nla_nest_end(skb, inner);
- } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) {
inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
if (!inner)
goto failure;
@@ -585,15 +617,15 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
break;
}
nla_nest_end(skb, inner);
- } else if (opts->flags & TUNNEL_GENEVE_OPT) {
+ } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) {
struct geneve_opt *opt;
int offset = 0;
- inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
- if (!inner)
- goto failure;
while (opts->len > offset) {
- opt = (struct geneve_opt *)opts->u.data + offset;
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ opt = (struct geneve_opt *)(opts->u.data + offset);
if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
opt->opt_class) ||
nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
@@ -602,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
opt->length * 4, opt->opt_data))
goto inner_failure;
offset += sizeof(*opt) + opt->length * 4;
+ nla_nest_end(skb, inner);
}
- nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
@@ -630,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb,
{
u32 flags = 0;
- if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
- if (!(info->key.tun_flags & TUNNEL_CSUM))
+ if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
- if (info->key.tun_flags & TUNNEL_SEQ)
+ if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_SEQ_NUMBER;
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
@@ -685,6 +717,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = {
static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
.type = NFT_OBJECT_TUNNEL,
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_obj_ops,
.maxattr = NFTA_TUNNEL_KEY_MAX,
.policy = nft_tunnel_key_policy,
@@ -719,3 +752,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("tunnel");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
+MODULE_DESCRIPTION("nftables tunnel expression support");
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 06d5cabf1d7c..3210cfc966ab 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -16,17 +16,18 @@
#include <net/xfrm.h>
static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
- [NFTA_XFRM_KEY] = { .type = NLA_U32 },
+ [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DIR] = { .type = NLA_U8 },
- [NFTA_XFRM_SPNUM] = { .type = NLA_U32 },
+ [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DREG] = { .type = NLA_U32 },
};
struct nft_xfrm {
enum nft_xfrm_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
u8 dir;
u8 spnum;
+ u8 len;
};
static int nft_xfrm_get_init(const struct nft_ctx *ctx,
@@ -50,7 +51,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY]));
switch (priv->key) {
case NFT_XFRM_KEY_REQID:
case NFT_XFRM_KEY_SPI:
@@ -86,9 +87,9 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
priv->spnum = spnum;
- priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
/* Return true if key asks for daddr/saddr and current
@@ -111,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
return true;
}
- return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
+ mode == XFRM_MODE_IPTFS;
}
static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
@@ -133,13 +135,13 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
WARN_ON_ONCE(1);
break;
case NFT_XFRM_KEY_DADDR_IP4:
- *dest = state->id.daddr.a4;
+ *dest = (__force __u32)state->id.daddr.a4;
return;
case NFT_XFRM_KEY_DADDR_IP6:
memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
return;
case NFT_XFRM_KEY_SADDR_IP4:
- *dest = state->props.saddr.a4;
+ *dest = (__force __u32)state->props.saddr.a4;
return;
case NFT_XFRM_KEY_SADDR_IP6:
memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
@@ -148,7 +150,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
*dest = state->props.reqid;
return;
case NFT_XFRM_KEY_SPI:
- *dest = state->id.spi;
+ *dest = (__force __u32)state->id.spi;
return;
}
@@ -211,7 +213,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr,
}
static int nft_xfrm_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
@@ -228,12 +230,16 @@ static int nft_xfrm_get_dump(struct sk_buff *skb,
return 0;
}
-static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->dir) {
case XFRM_POLICY_IN:
hooks = (1 << NF_INET_FORWARD) |
@@ -253,6 +259,31 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+static bool nft_xfrm_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ const struct nft_xfrm *xfrm;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ xfrm = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != xfrm->key ||
+ priv->dreg != xfrm->dreg ||
+ priv->dir != xfrm->dir ||
+ priv->spnum != xfrm->spnum) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
static struct nft_expr_type nft_xfrm_type;
static const struct nft_expr_ops nft_xfrm_get_ops = {
@@ -262,6 +293,7 @@ static const struct nft_expr_ops nft_xfrm_get_ops = {
.init = nft_xfrm_get_init,
.dump = nft_xfrm_get_dump,
.validate = nft_xfrm_validate,
+ .reduce = nft_xfrm_reduce,
};
static struct nft_expr_type nft_xfrm_type __read_mostly = {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 51b454d8fa9c..008419db815a 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -25,7 +25,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP)
skb->csum = 0;
@@ -51,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
skb->len - dataoff, 0);
@@ -79,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = ~csum_unfold(
csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -106,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip6_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
hsum = skb_checksum(skb, 0, dataoff, 0);
skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
@@ -179,39 +179,54 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
}
EXPORT_SYMBOL_GPL(nf_route);
-static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+/* Only get and check the lengths, not do any hop-by-hop stuff. */
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
{
-#ifdef CONFIG_INET
- const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
- if (entry->state.hook == NF_INET_LOCAL_OUT) {
- const struct iphdr *iph = ip_hdr(skb);
-
- if (!(iph->tos == rt_info->tos &&
- skb->mark == rt_info->mark &&
- iph->daddr == rt_info->daddr &&
- iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, skb,
- RTN_UNSPEC);
+ int len, off = sizeof(struct ipv6hdr);
+ unsigned char *nh;
+
+ if (!pskb_may_pull(skb, off + 8))
+ return -ENOMEM;
+ nh = (unsigned char *)(ipv6_hdr(skb) + 1);
+ len = (nh[1] + 1) << 3;
+
+ if (!pskb_may_pull(skb, off + len))
+ return -ENOMEM;
+ nh = skb_network_header(skb);
+
+ off += 2;
+ len -= 2;
+ while (len > 0) {
+ int optlen;
+
+ if (nh[off] == IPV6_TLV_PAD1) {
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ return -EBADMSG;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ return -EBADMSG;
+
+ if (nh[off] == IPV6_TLV_JUMBO) {
+ u32 pkt_len;
+
+ if (nh[off + 1] != 4 || (off & 3) != 2)
+ return -EBADMSG;
+ pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+ if (pkt_len <= IPV6_MAXPLEN ||
+ ipv6_hdr(skb)->payload_len)
+ return -EBADMSG;
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+ return -EBADMSG;
+ *plen = pkt_len;
+ }
+ off += optlen;
+ len -= optlen;
}
-#endif
- return 0;
-}
-
-int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
-{
- const struct nf_ipv6_ops *v6ops;
- int ret = 0;
- switch (entry->state.pf) {
- case AF_INET:
- ret = nf_ip_reroute(skb, entry);
- break;
- case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- ret = v6ops->reroute(skb, entry);
- break;
- }
- return ret;
+ return len ? -EBADMSG : 0;
}
+EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 99a468be4a59..90b7630421c4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -24,6 +24,7 @@
#include <linux/audit.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp.h>
@@ -38,6 +39,24 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define XT_PCPU_BLOCK_SIZE 4096
#define XT_MAX_TABLE_SIZE (512 * 1024 * 1024)
+struct xt_template {
+ struct list_head list;
+
+ /* called when table is needed in the given netns */
+ int (*table_init)(struct net *net);
+
+ struct module *me;
+
+ /* A unique name... */
+ char name[XT_TABLE_MAXNAMELEN];
+};
+
+static struct list_head xt_templates[NFPROTO_NUMPROTO];
+
+struct xt_pernet {
+ struct list_head tables[NFPROTO_NUMPROTO];
+};
+
struct compat_delta {
unsigned int offset; /* offset in kernel */
int delta; /* delta in 32bit user land */
@@ -47,7 +66,7 @@ struct xt_af {
struct mutex mutex;
struct list_head match;
struct list_head target;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct mutex compat_mutex;
struct compat_delta *compat_tab;
unsigned int number; /* number of slots in compat_tab[] */
@@ -55,7 +74,8 @@ struct xt_af {
#endif
};
-static struct xt_af *xt;
+static unsigned int xt_pernet_id __read_mostly;
+static struct xt_af *xt __read_mostly;
static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_UNSPEC] = "x",
@@ -330,6 +350,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_match *m;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(m, &xt[af].match, list) {
if (strcmp(m->name, name) == 0) {
if (m->revision > *bestp)
@@ -338,6 +359,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return match_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -350,6 +372,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_target *t;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &xt[af].target, list) {
if (strcmp(t->name, name) == 0) {
if (t->revision > *bestp)
@@ -358,6 +381,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return target_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -371,12 +395,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
{
int have_rev, best = -1;
- mutex_lock(&xt[af].mutex);
if (target == 1)
have_rev = target_revfn(af, name, revision, &best);
else
have_rev = match_revfn(af, name, revision, &best);
- mutex_unlock(&xt[af].mutex);
/* Nothing at all? Return 0 to try loading module. */
if (best == -1) {
@@ -639,7 +661,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
return usersize == kernsize && strnlen(msg, msglen) < msglen;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
struct xt_af *xp = &xt[af];
@@ -731,7 +753,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
{
const struct xt_match *match = m->u.kernel.match;
struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
- int pad, off = xt_compat_match_offset(match);
+ int off = xt_compat_match_offset(match);
u_int16_t msize = cm->u.user.match_size;
char name[sizeof(m->u.user.name)];
@@ -741,15 +763,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
match->compat_from_user(m->data, cm->data);
else
memcpy(m->data, cm->data, msize - sizeof(*cm));
- pad = XT_ALIGN(match->matchsize) - match->matchsize;
- if (pad > 0)
- memset(m->data + match->matchsize, 0, pad);
msize += off;
m->u.user.match_size = msize;
- strlcpy(name, match->name, sizeof(name));
+ strscpy(name, match->name, sizeof(name));
module_put(match->me);
- strncpy(m->u.user.name, name, sizeof(m->u.user.name));
+ strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name));
*size += off;
*dstptr += msize;
@@ -845,7 +864,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
__alignof__(struct compat_xt_entry_match));
}
EXPORT_SYMBOL(xt_compat_check_entry_offsets);
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
/**
* xt_check_entry_offsets - validate arp/ip/ip6t_entry
@@ -863,7 +882,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
* match structures are aligned, and that the last structure ends where
* the target structure begins.
*
- * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version.
*
* The arp/ip/ip6t_entry structure @base must have passed following tests:
* - it must point to a valid memory location
@@ -1028,34 +1047,34 @@ int xt_check_target(struct xt_tgchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_target);
/**
- * xt_copy_counters_from_user - copy counters and metadata from userspace
+ * xt_copy_counters - copy counters and metadata from a sockptr_t
*
- * @user: src pointer to userspace memory
+ * @arg: src sockptr
* @len: alleged size of userspace memory
* @info: where to store the xt_counters_info metadata
- * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
*
* Copies counter meta data from @user and stores it in @info.
*
* vmallocs memory to hold the counters, then copies the counter data
* from @user to the new memory and returns a pointer to it.
*
- * If @compat is true, @info gets converted automatically to the 64bit
- * representation.
+ * If called from a compat syscall, @info gets converted automatically to the
+ * 64bit representation.
*
* The metadata associated with the counters is stored in @info.
*
* Return: returns pointer that caller has to test via IS_ERR().
* If IS_ERR is false, caller has to vfree the pointer.
*/
-void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
- struct xt_counters_info *info, bool compat)
+void *xt_copy_counters(sockptr_t arg, unsigned int len,
+ struct xt_counters_info *info)
{
+ size_t offset;
void *mem;
u64 size;
-#ifdef CONFIG_COMPAT
- if (compat) {
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall()) {
/* structures only differ in size due to alignment */
struct compat_xt_counters_info compat_tmp;
@@ -1063,12 +1082,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(compat_tmp);
- if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+ if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0)
return ERR_PTR(-EFAULT);
memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
info->num_counters = compat_tmp.num_counters;
- user += sizeof(compat_tmp);
+ offset = sizeof(compat_tmp);
} else
#endif
{
@@ -1076,10 +1095,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(*info);
- if (copy_from_user(info, user, sizeof(*info)) != 0)
+ if (copy_from_sockptr(info, arg, sizeof(*info)) != 0)
return ERR_PTR(-EFAULT);
- user += sizeof(*info);
+ offset = sizeof(*info);
}
info->name[sizeof(info->name) - 1] = '\0';
@@ -1093,15 +1112,15 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
if (!mem)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(mem, user, len) == 0)
+ if (copy_from_sockptr_offset(mem, arg, offset, len) == 0)
return mem;
vfree(mem);
return ERR_PTR(-EFAULT);
}
-EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+EXPORT_SYMBOL_GPL(xt_copy_counters);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_target_offset(const struct xt_target *target)
{
u_int16_t csize = target->compatsize ? : target->targetsize;
@@ -1114,7 +1133,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
{
const struct xt_target *target = t->u.kernel.target;
struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
- int pad, off = xt_compat_target_offset(target);
+ int off = xt_compat_target_offset(target);
u_int16_t tsize = ct->u.user.target_size;
char name[sizeof(t->u.user.name)];
@@ -1123,16 +1142,14 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
if (target->compat_from_user)
target->compat_from_user(t->data, ct->data);
else
- memcpy(t->data, ct->data, tsize - sizeof(*ct));
- pad = XT_ALIGN(target->targetsize) - target->targetsize;
- if (pad > 0)
- memset(t->data + target->targetsize, 0, pad);
+ unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct),
+ /* UAPI 0-sized destination */);
tsize += off;
t->u.user.target_size = tsize;
- strlcpy(name, target->name, sizeof(name));
+ strscpy(name, target->name, sizeof(name));
module_put(target->me);
- strncpy(t->u.user.name, name, sizeof(t->u.user.name));
+ strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name));
*size += off;
*dstptr += tsize;
@@ -1197,50 +1214,65 @@ void xt_free_table_info(struct xt_table_info *info)
}
EXPORT_SYMBOL(xt_free_table_info);
+struct xt_table *xt_find_table(struct net *net, u8 af, const char *name)
+{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *t;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_net->tables[af], list) {
+ if (strcmp(t->name, name) == 0) {
+ mutex_unlock(&xt[af].mutex);
+ return t;
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+ return NULL;
+}
+EXPORT_SYMBOL(xt_find_table);
+
/* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
- struct xt_table *t, *found = NULL;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct module *owner = NULL;
+ struct xt_template *tmpl;
+ struct xt_table *t;
mutex_lock(&xt[af].mutex);
- list_for_each_entry(t, &net->xt.tables[af], list)
+ list_for_each_entry(t, &xt_net->tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
- if (net == &init_net)
- goto out;
-
- /* Table doesn't exist in this netns, re-try init */
- list_for_each_entry(t, &init_net.xt.tables[af], list) {
+ /* Table doesn't exist in this netns, check larval list */
+ list_for_each_entry(tmpl, &xt_templates[af], list) {
int err;
- if (strcmp(t->name, name))
+ if (strcmp(tmpl->name, name))
continue;
- if (!try_module_get(t->me))
+ if (!try_module_get(tmpl->me))
goto out;
+
+ owner = tmpl->me;
+
mutex_unlock(&xt[af].mutex);
- err = t->table_init(net);
+ err = tmpl->table_init(net);
if (err < 0) {
- module_put(t->me);
+ module_put(owner);
return ERR_PTR(err);
}
- found = t;
-
mutex_lock(&xt[af].mutex);
break;
}
- if (!found)
- goto out;
-
/* and once again: */
- list_for_each_entry(t, &net->xt.tables[af], list)
- if (strcmp(t->name, name) == 0)
+ list_for_each_entry(t, &xt_net->tables[af], list)
+ if (strcmp(t->name, name) == 0 && owner == t->me)
return t;
- module_put(found->me);
+ module_put(owner);
out:
mutex_unlock(&xt[af].mutex);
return ERR_PTR(-ENOENT);
@@ -1271,7 +1303,7 @@ void xt_table_unlock(struct xt_table *table)
}
EXPORT_SYMBOL_GPL(xt_table_unlock);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
void xt_compat_lock(u_int8_t af)
{
mutex_lock(&xt[af].compat_mutex);
@@ -1285,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif
-DEFINE_PER_CPU(seqcount_t, xt_recseq);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
-
struct static_key xt_tee_enabled __read_mostly;
EXPORT_SYMBOL_GPL(xt_tee_enabled);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
@@ -1387,7 +1420,7 @@ xt_replace_table(struct xt_table *table,
table->private = newinfo;
/* make sure all cpus see new ->private value */
- smp_wmb();
+ smp_mb();
/*
* Even though table entries have now been swapped, other CPU's
@@ -1410,7 +1443,8 @@ xt_replace_table(struct xt_table *table,
audit_log_nfcfg(table->name, table->af, private->number,
!private->number ? AUDIT_XT_OP_REGISTER :
- AUDIT_XT_OP_REPLACE);
+ AUDIT_XT_OP_REPLACE,
+ GFP_KERNEL);
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1420,9 +1454,10 @@ struct xt_table *xt_register_table(struct net *net,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
- int ret;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
struct xt_table_info *private;
struct xt_table *t, *table;
+ int ret;
/* Don't add one object to multiple lists. */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
@@ -1433,7 +1468,7 @@ struct xt_table *xt_register_table(struct net *net,
mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
- list_for_each_entry(t, &net->xt.tables[table->af], list) {
+ list_for_each_entry(t, &xt_net->tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
goto unlock;
@@ -1452,7 +1487,7 @@ struct xt_table *xt_register_table(struct net *net,
/* save number of initial entries */
private->initial_entries = private->number;
- list_add(&table->list, &net->xt.tables[table->af]);
+ list_add(&table->list, &xt_net->tables[table->af]);
mutex_unlock(&xt[table->af].mutex);
return table;
@@ -1473,34 +1508,42 @@ void *xt_unregister_table(struct xt_table *table)
list_del(&table->list);
mutex_unlock(&xt[table->af].mutex);
audit_log_nfcfg(table->name, table->af, private->number,
- AUDIT_XT_OP_UNREGISTER);
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ kfree(table->ops);
kfree(table);
return private;
}
EXPORT_SYMBOL_GPL(xt_unregister_table);
+#endif
#ifdef CONFIG_PROC_FS
static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
+
+ xt_net = net_generic(net, xt_pernet_id);
mutex_lock(&xt[af].mutex);
- return seq_list_start(&net->xt.tables[af], *pos);
+ return seq_list_start(&xt_net->tables[af], *pos);
}
static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
- return seq_list_next(v, &net->xt.tables[af], pos);
+ xt_net = net_generic(net, xt_pernet_id);
+
+ return seq_list_next(v, &xt_net->tables[af], pos);
}
static void xt_table_seq_stop(struct seq_file *seq, void *v)
{
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u_int8_t af = (unsigned long)pde_data(file_inode(seq->file));
mutex_unlock(&xt[af].mutex);
}
@@ -1544,7 +1587,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
[MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
};
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
if (ppos != NULL)
@@ -1571,7 +1614,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
trav->curr = trav->curr->next;
if (trav->curr != trav->head)
break;
- /* fall through */
+ fallthrough;
default:
return NULL;
}
@@ -1593,7 +1636,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
switch (trav->class) {
@@ -1718,6 +1761,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
}
EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
+int xt_register_template(const struct xt_table *table,
+ int (*table_init)(struct net *net))
+{
+ int ret = -EEXIST, af = table->af;
+ struct xt_template *t;
+
+ mutex_lock(&xt[af].mutex);
+
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0))
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out_unlock;
+
+ BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name));
+
+ strscpy(t->name, table->name, sizeof(t->name));
+ t->table_init = table_init;
+ t->me = table->me;
+ list_add(&t->list, &xt_templates[af]);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&xt[af].mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xt_register_template);
+
+void xt_unregister_template(const struct xt_table *table)
+{
+ struct xt_template *t;
+ int af = table->af;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (strcmp(table->name, t->name))
+ continue;
+
+ list_del(&t->list);
+ mutex_unlock(&xt[af].mutex);
+ kfree(t);
+ return;
+ }
+
+ mutex_unlock(&xt[af].mutex);
+ WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL_GPL(xt_unregister_template);
+
int xt_proto_init(struct net *net, u_int8_t af)
{
#ifdef CONFIG_PROC_FS
@@ -1735,7 +1830,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
root_uid = make_kuid(net->user_ns, 0);
root_gid = make_kgid(net->user_ns, 0);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops,
sizeof(struct seq_net_private),
@@ -1745,7 +1840,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_match_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1755,7 +1850,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_target_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1770,12 +1865,12 @@ int xt_proto_init(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
out_remove_matches:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out_remove_tables:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out:
@@ -1789,21 +1884,22 @@ void xt_proto_fini(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
char buf[XT_FUNCTION_MAXNAMELEN];
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
#endif /*CONFIG_PROC_FS*/
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
/**
* xt_percpu_counter_alloc - allocate x_tables rule counter
*
@@ -1858,27 +1954,32 @@ void xt_percpu_counter_free(struct xt_counters *counters)
free_percpu((void __percpu *)pcnt);
}
EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+#endif
static int __net_init xt_net_init(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- INIT_LIST_HEAD(&net->xt.tables[i]);
+ INIT_LIST_HEAD(&xt_net->tables[i]);
return 0;
}
static void __net_exit xt_net_exit(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- WARN_ON_ONCE(!list_empty(&net->xt.tables[i]));
+ WARN_ON_ONCE(!list_empty(&xt_net->tables[i]));
}
static struct pernet_operations xt_net_ops = {
.init = xt_net_init,
.exit = xt_net_exit,
+ .id = &xt_pernet_id,
+ .size = sizeof(struct xt_pernet),
};
static int __init xt_init(void)
@@ -1886,8 +1987,10 @@ static int __init xt_init(void)
unsigned int i;
int rv;
- for_each_possible_cpu(i) {
- seqcount_init(&per_cpu(xt_recseq, i));
+ if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) {
+ for_each_possible_cpu(i) {
+ seqcount_init(&per_cpu(xt_recseq, i));
+ }
}
xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL);
@@ -1896,12 +1999,13 @@ static int __init xt_init(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++) {
mutex_init(&xt[i].mutex);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
mutex_init(&xt[i].compat_mutex);
xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
+ INIT_LIST_HEAD(&xt_templates[i]);
}
rv = register_pernet_subsys(&xt_net_ops);
if (rv < 0)
@@ -1917,4 +2021,3 @@ static void __exit xt_fini(void)
module_init(xt_init);
module_exit(xt_fini);
-
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 9cdc16b0d0d8..b6a015aee0ce 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -117,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par)
const struct xt_audit_info *info = par->targinfo;
if (info->type > XT_AUDIT_TYPE_MAX) {
- pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n",
+ pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n",
XT_AUDIT_TYPE_MAX);
return -ERANGE;
}
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index c8a639f56168..9d99f5a3d176 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -63,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par)
return 0;
}
-static struct xt_target checksum_tg_reg __read_mostly = {
- .name = "CHECKSUM",
- .family = NFPROTO_UNSPEC,
- .target = checksum_tg,
- .targetsize = sizeof(struct xt_CHECKSUM_info),
- .table = "mangle",
- .checkentry = checksum_tg_check,
- .me = THIS_MODULE,
+static struct xt_target checksum_tg_reg[] __read_mostly = {
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV4,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV6,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init checksum_tg_init(void)
{
- return xt_register_target(&checksum_tg_reg);
+ return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
static void __exit checksum_tg_exit(void)
{
- xt_unregister_target(&checksum_tg_reg);
+ xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
module_init(checksum_tg_init);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 0accac98dea7..0ae8d8a1216e 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
{
.name = "CLASSIFY",
.revision = 0,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
+ (1 << NF_INET_POST_ROUTING),
.target = classify_tg,
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
@@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init classify_tg_init(void)
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index a5c8b653476a..1494b3ee30e1 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -6,7 +6,7 @@
* with the SECMARK target and state match.
*
* Based somewhat on CONNMARK:
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
*
* (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_target connsecmark_tg_reg __read_mostly = {
- .name = "CONNSECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = connsecmark_tg_check,
- .destroy = connsecmark_tg_destroy,
- .target = connsecmark_tg,
- .targetsize = sizeof(struct xt_connsecmark_target_info),
- .me = THIS_MODULE,
+static struct xt_target connsecmark_tg_reg[] __read_mostly = {
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connsecmark_tg_init(void)
{
- return xt_register_target(&connsecmark_tg_reg);
+ return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
static void __exit connsecmark_tg_exit(void)
{
- xt_unregister_target(&connsecmark_tg_reg);
+ xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
module_init(connsecmark_tg_init);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index d4deee39158b..3ba94c34297c 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
return XT_CONTINUE;
if (ct) {
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
} else {
nf_ct_set(skb, ct, IP_CT_UNTRACKED);
@@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOMEM;
}
- help->helper = helper;
+ rcu_assign_pointer(help->helper, helper);
return 0;
}
@@ -136,6 +136,21 @@ static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
}
}
+static void xt_ct_put_helper(struct nf_conn_help *help)
+{
+ struct nf_conntrack_helper *helper;
+
+ if (!help)
+ return;
+
+ /* not yet exposed to other cpus, or ruleset
+ * already detached (post-replacement).
+ */
+ helper = rcu_dereference_raw(help->helper);
+ if (helper)
+ nf_conntrack_helper_put(helper);
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
@@ -172,7 +187,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err2;
}
- ret = 0;
if ((info->ct_events || info->exp_events) &&
!nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
GFP_KERNEL)) {
@@ -202,15 +216,13 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err4;
}
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err4:
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -272,8 +284,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
nf_ct_netns_put(par->net, par->family);
@@ -302,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
xt_ct_tg_destroy(par, par->targinfo);
}
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->_nfct != 0)
+ return XT_CONTINUE;
+
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return XT_CONTINUE;
+}
+
static struct xt_target xt_ct_tg_reg[] __read_mostly = {
{
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.targetsize = sizeof(struct xt_ct_target_info),
.usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v0,
@@ -316,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -328,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -338,60 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.table = "raw",
.me = THIS_MODULE,
},
-};
-
-static unsigned int
-notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- /* Previously seen (loopback)? Ignore. */
- if (skb->_nfct != 0)
- return XT_CONTINUE;
-
- nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-
- return XT_CONTINUE;
-}
-
-static int notrack_chk(const struct xt_tgchk_param *par)
-{
- if (!par->net->xt.notrack_deprecated_warning) {
- pr_info("netfilter: NOTRACK target is deprecated, "
- "use CT instead or upgrade iptables\n");
- par->net->xt.notrack_deprecated_warning = true;
- }
- return 0;
-}
-
-static struct xt_target notrack_tg_reg __read_mostly = {
- .name = "NOTRACK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = notrack_chk,
- .target = notrack_tg,
- .table = "raw",
- .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .targetsize = sizeof(struct xt_ct_target_info),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v0,
+ .destroy = xt_ct_tg_destroy_v0,
+ .target = xt_ct_target_v0,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v1,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 2,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v2,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_ct_tg_init(void)
{
- int ret;
-
- ret = xt_register_target(&notrack_tg_reg);
- if (ret < 0)
- return ret;
-
- ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- if (ret < 0) {
- xt_unregister_target(&notrack_tg_reg);
- return ret;
- }
- return 0;
+ return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
}
static void __exit xt_ct_tg_exit(void)
{
xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- xt_unregister_target(&notrack_tg_reg);
}
module_init(xt_ct_tg_init);
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index eababc354ff1..cfa44515ab72 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -24,6 +24,8 @@ MODULE_ALIAS("ip6t_DSCP");
MODULE_ALIAS("ipt_TOS");
MODULE_ALIAS("ip6t_TOS");
+#define XT_DSCP_ECN_MASK 3u
+
static unsigned int
dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -34,8 +36,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
- ipv4_change_dsfield(ip_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
@@ -52,8 +53,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
- ipv6_change_dsfield(ipv6_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index 713fb38541df..8928ec56c388 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -276,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
return 0;
/* follow-up fragments don't contain ports, skip all fragments */
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ if (ip_is_fragment(ip))
return 0;
hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 7b2f359bfce4..d73957592c9d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -85,9 +85,9 @@ static ssize_t idletimer_tg_show(struct device *dev,
mutex_unlock(&list_mutex);
if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
- return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff);
+ return sysfs_emit(buf, "%ld\n", time_diff);
- return snprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -100,21 +100,19 @@ static void idletimer_tg_work(struct work_struct *work)
static void idletimer_tg_expired(struct timer_list *t)
{
- struct idletimer_tg *timer = from_timer(timer, t, timer);
+ struct idletimer_tg *timer = timer_container_of(timer, t, timer);
pr_debug("timer %s expired\n", timer->attr.attr.name);
schedule_work(&timer->work);
}
-static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
- ktime_t now)
+static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now)
{
struct idletimer_tg *timer = alarm->data;
pr_debug("alarm %s expired\n", timer->attr.attr.name);
schedule_work(&timer->work);
- return ALARMTIMER_NORESTART;
}
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
@@ -137,7 +135,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
{
int ret;
- info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL);
if (!info->timer) {
ret = -ENOMEM;
goto out;
@@ -170,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
INIT_WORK(&info->timer->work, idletimer_tg_work);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return 0;
@@ -231,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
} else {
timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return 0;
@@ -256,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
info->label, info->timeout);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return XT_CONTINUE;
}
@@ -277,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
alarm_start_relative(&info->timer->alarm, tout);
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
return XT_CONTINUE;
@@ -322,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
if (info->timer) {
info->timer->refcnt++;
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -384,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
}
} else {
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
}
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -409,21 +407,23 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_lock(&list_mutex);
- if (--info->timer->refcnt == 0) {
- pr_debug("deleting timer %s\n", info->label);
-
- list_del(&info->timer->entry);
- del_timer_sync(&info->timer->timer);
- cancel_work_sync(&info->timer->work);
- sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
- kfree(info->timer->attr.attr.name);
- kfree(info->timer);
- } else {
+ if (--info->timer->refcnt > 0) {
pr_debug("decreased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
}
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
mutex_unlock(&list_mutex);
+
+ timer_shutdown_sync(&info->timer->timer);
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
}
static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
@@ -434,52 +434,75 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
mutex_lock(&list_mutex);
- if (--info->timer->refcnt == 0) {
- pr_debug("deleting timer %s\n", info->label);
-
- list_del(&info->timer->entry);
- if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
- alarm_cancel(&info->timer->alarm);
- } else {
- del_timer_sync(&info->timer->timer);
- }
- cancel_work_sync(&info->timer->work);
- sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
- kfree(info->timer->attr.attr.name);
- kfree(info->timer);
- } else {
+ if (--info->timer->refcnt > 0) {
pr_debug("decreased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
}
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
mutex_unlock(&list_mutex);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ alarm_cancel(&info->timer->alarm);
+ } else {
+ timer_shutdown_sync(&info->timer->timer);
+ }
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
}
static struct xt_target idletimer_tg[] __read_mostly = {
{
- .name = "IDLETIMER",
- .family = NFPROTO_UNSPEC,
- .target = idletimer_tg_target,
- .targetsize = sizeof(struct idletimer_tg_info),
- .usersize = offsetof(struct idletimer_tg_info, timer),
- .checkentry = idletimer_tg_checkentry,
- .destroy = idletimer_tg_destroy,
- .me = THIS_MODULE,
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
},
{
- .name = "IDLETIMER",
- .family = NFPROTO_UNSPEC,
- .revision = 1,
- .target = idletimer_tg_target_v1,
- .targetsize = sizeof(struct idletimer_tg_info_v1),
- .usersize = offsetof(struct idletimer_tg_info_v1, timer),
- .checkentry = idletimer_tg_checkentry_v1,
- .destroy = idletimer_tg_destroy_v1,
- .me = THIS_MODULE,
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
},
-
-
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct class *idletimer_tg_class;
@@ -490,7 +513,7 @@ static int __init idletimer_tg_init(void)
{
int err;
- idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ idletimer_tg_class = class_create("xt_idletimer");
err = PTR_ERR(idletimer_tg_class);
if (IS_ERR(idletimer_tg_class)) {
pr_debug("couldn't register device class\n");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index 0371c387b0d1..90dcf088071a 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -43,7 +43,6 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_led_info *ledinfo = par->targinfo;
struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
- unsigned long led_delay = XT_LED_BLINK_DELAY;
/*
* If "always blink" is enabled, and there's still some time until the
@@ -52,7 +51,7 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
if ((ledinfo->delay > 0) && ledinfo->always_blink &&
timer_pending(&ledinternal->timer))
led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger,
- &led_delay, &led_delay, 1);
+ XT_LED_BLINK_DELAY, XT_LED_BLINK_DELAY, 1);
else
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL);
@@ -73,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
static void led_timeout_callback(struct timer_list *t)
{
- struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t,
- timer);
+ struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal,
+ t,
+ timer);
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
}
@@ -97,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par)
struct xt_led_info_internal *ledinternal;
int err;
- if (ledinfo->id[0] == '\0')
+ /* Bail out if empty string or not a string at all. */
+ if (ledinfo->id[0] == '\0' ||
+ !memchr(ledinfo->id, '\0', sizeof(ledinfo->id)))
return -EINVAL;
mutex_lock(&xt_led_mutex);
@@ -166,7 +168,7 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&ledinternal->list);
- del_timer_sync(&ledinternal->timer);
+ timer_shutdown_sync(&ledinternal->timer);
led_trigger_unregister(&ledinternal->netfilter_led_trigger);
@@ -176,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par)
kfree(ledinternal);
}
-static struct xt_target led_tg_reg __read_mostly = {
- .name = "LED",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = led_tg,
- .targetsize = sizeof(struct xt_led_info),
- .usersize = offsetof(struct xt_led_info, internal_data),
- .checkentry = led_tg_check,
- .destroy = led_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target led_tg_reg[] __read_mostly = {
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init led_tg_init(void)
{
- return xt_register_target(&led_tg_reg);
+ return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
static void __exit led_tg_exit(void)
{
- xt_unregister_target(&led_tg_reg);
+ xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
module_init(led_tg_init);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index a1e79b517c01..f39244f9c0ed 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int log_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
+ int ret;
if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
return -EINVAL;
@@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nf_log_syslog");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ }
+
+ return ret;
}
static void log_tg_destroy(const struct xt_tgdtor_param *par)
@@ -108,3 +116,4 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging");
MODULE_ALIAS("ipt_LOG");
MODULE_ALIAS("ip6t_LOG");
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 6e83ce3000db..6dcf4bc7e30b 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int nflog_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
+ int ret;
if (info->flags & ~XT_NFLOG_MASK)
return -EINVAL;
if (info->prefix[sizeof(info->prefix) - 1] != '\0')
return -EINVAL;
- return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nfnetlink_log");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ }
+
+ return ret;
}
static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
@@ -56,26 +64,41 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
-static struct xt_target nflog_tg_reg __read_mostly = {
- .name = "NFLOG",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = nflog_tg_check,
- .destroy = nflog_tg_destroy,
- .target = nflog_tg,
- .targetsize = sizeof(struct xt_nflog_info),
- .me = THIS_MODULE,
+static struct xt_target nflog_tg_reg[] __read_mostly = {
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init nflog_tg_init(void)
{
- return xt_register_target(&nflog_tg_reg);
+ return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
static void __exit nflog_tg_exit(void)
{
- xt_unregister_target(&nflog_tg_reg);
+ xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
module_init(nflog_tg_init);
module_exit(nflog_tg_exit);
+MODULE_SOFTDEP("pre: nfnetlink_log");
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 37253d399c6b..4f49cfc27831 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -94,11 +94,11 @@ static unsigned int
xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_rateest_target_info *info = par->targinfo;
- struct gnet_stats_basic_packed *stats = &info->est->bstats;
+ struct gnet_stats_basic_sync *stats = &info->est->bstats;
spin_lock_bh(&info->est->lock);
- stats->bytes += skb->len;
- stats->packets++;
+ u64_stats_add(&stats->bytes, skb->len);
+ u64_stats_inc(&stats->packets);
spin_unlock_bh(&info->est->lock);
return XT_CONTINUE;
@@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
+ if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name))
+ return -ENAMETOOLONG;
+
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
mutex_lock(&xn->hash_lock);
@@ -140,7 +143,8 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
if (!est)
goto err1;
- strlcpy(est->name, info->name, sizeof(est->name));
+ gnet_stats_basic_sync_init(&est->bstats);
+ strscpy(est->name, info->name, sizeof(est->name));
spin_lock_init(&est->lock);
est->refcnt = 1;
est->params.interval = info->interval;
@@ -175,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
xt_rateest_put(par->net, info->est);
}
-static struct xt_target xt_rateest_tg_reg __read_mostly = {
- .name = "RATEEST",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = xt_rateest_tg,
- .checkentry = xt_rateest_tg_checkentry,
- .destroy = xt_rateest_tg_destroy,
- .targetsize = sizeof(struct xt_rateest_target_info),
- .usersize = offsetof(struct xt_rateest_target_info, est),
- .me = THIS_MODULE,
+static struct xt_target xt_rateest_tg_reg[] __read_mostly = {
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#endif
};
static __net_init int xt_rateest_net_init(struct net *net)
@@ -210,12 +229,12 @@ static int __init xt_rateest_tg_init(void)
if (err)
return err;
- return xt_register_target(&xt_rateest_tg_reg);
+ return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
}
static void __exit xt_rateest_tg_fini(void)
{
- xt_unregister_target(&xt_rateest_tg_reg);
+ xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
unregister_pernet_subsys(&xt_rateest_net_ops);
}
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 353ca7801251..ff66b56a3f97 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-/* FIXME: Take multiple ranges --RR */
static int redirect_tg4_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
static unsigned int
redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range2 range = {
+ .flags = mr->range[0].flags,
+ .min_proto = mr->range[0].min,
+ .max_proto = mr->range[0].max,
+ };
+
+ return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
}
static struct xt_target redirect_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 75625d13e976..5bc5ea505eb9 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK");
static u8 mode;
static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
{
u32 secmark = 0;
- const struct xt_secmark_target_info *info = par->targinfo;
switch (mode) {
case SECMARK_MODE_SEL:
@@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
{
int err;
@@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
return 0;
}
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
{
- struct xt_secmark_target_info *info = par->targinfo;
int err;
- if (strcmp(par->table, "mangle") != 0 &&
- strcmp(par->table, "security") != 0) {
+ if (strcmp(table, "mangle") != 0 &&
+ strcmp(table, "security") != 0) {
pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
- par->table);
+ table);
return -EINVAL;
}
@@ -116,25 +115,99 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
}
}
-static struct xt_target secmark_tg_reg __read_mostly = {
- .name = "SECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = secmark_tg_check,
- .destroy = secmark_tg_destroy,
- .target = secmark_tg,
- .targetsize = sizeof(struct xt_secmark_target_info),
- .me = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .mode = info->mode,
+ };
+ int ret;
+
+ memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+ ret = secmark_tg_check(par->table, &newinfo);
+ info->secid = newinfo.secid;
+
+ return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .secid = info->secid,
+ };
+
+ return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init secmark_tg_init(void)
{
- return xt_register_target(&secmark_tg_reg);
+ return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
static void __exit secmark_tg_exit(void)
{
- xt_unregister_target(&secmark_tg_reg);
+ xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
module_init(secmark_tg_init);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 122db9fbb9f4..116a885adb3c 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -239,8 +239,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
oldlen = ipv6h->payload_len;
newlen = htons(ntohs(oldlen) + ret);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(csum_sub(skb->csum, oldlen),
- newlen);
+ skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen),
+ (__force __wsum)newlen);
ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 30e99464171b..93f064306901 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 194dc03341f3..e4bea1d346cf 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -74,18 +74,10 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
-
- pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->daddr, ntohs(hp->dest),
- &laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
return NF_DROP;
}
@@ -122,16 +114,12 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int tproto;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ if (tproto < 0)
return NF_DROP;
- }
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ if (!hp)
return NF_DROP;
- }
/* check if there's an ongoing connection on the packet
* addresses, this happens if the redirect already happened
@@ -168,19 +156,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
-
- pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
-
return NF_DROP;
}
@@ -200,6 +179,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
pr_info_ratelimited("Can be used only with -p tcp or -p udp\n");
return -EINVAL;
}
+
+static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv6_disable(par->net);
+}
#endif
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
@@ -219,6 +203,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
+static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv4_disable(par->net);
+}
+
static struct xt_target tproxy_tg_reg[] __read_mostly = {
{
.name = "TPROXY",
@@ -228,6 +217,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 0,
.targetsize = sizeof(struct xt_tproxy_target_info),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -239,6 +229,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -251,6 +242,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg6_check,
+ .destroy = tproxy_tg6_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 349ab5609b1b..a642ff09fc8e 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -29,26 +29,41 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target trace_tg_reg __read_mostly = {
- .name = "TRACE",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "raw",
- .target = trace_tg,
- .checkentry = trace_tg_check,
- .destroy = trace_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target trace_tg_reg[] __read_mostly = {
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init trace_tg_init(void)
{
- return xt_register_target(&trace_tg_reg);
+ return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
static void __exit trace_tg_exit(void)
{
- xt_unregister_target(&trace_tg_reg);
+ xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
module_init(trace_tg_init);
module_exit(trace_tg_exit);
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index e9b2181e8c42..a77088943107 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
},
{
.name = "addrtype",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "addrtype",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct xt_addrtype_info_v1),
+ .me = THIS_MODULE
+ },
+#endif
};
static int __init addrtype_mt_init(void)
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 13cf3f9b5938..849ac552a154 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
- return BPF_PROG_RUN(info->filter, skb);
+ return bpf_prog_run(info->filter, skb);
}
static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index c0f5e9a4f3c6..c437fbd59ec1 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching");
MODULE_ALIAS("ipt_cgroup");
MODULE_ALIAS("ip6t_cgroup");
+#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n"
+
static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
{
struct xt_cgroup_info_v0 *info = par->matchinfo;
@@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
if (info->invert & ~1)
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
return 0;
}
@@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
+#ifdef CONFIG_CGROUP_NET_CLASSID
const struct xt_cgroup_info_v0 *info = par->matchinfo;
struct sock *sk = skb->sk;
@@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
info->invert;
+#endif
+ return false;
}
static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
@@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
@@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index a047a545371e..908fd5f2c3c8 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match xt_cluster_match __read_mostly = {
- .name = "cluster",
- .family = NFPROTO_UNSPEC,
- .match = xt_cluster_mt,
- .checkentry = xt_cluster_mt_checkentry,
- .matchsize = sizeof(struct xt_cluster_match_info),
- .destroy = xt_cluster_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match xt_cluster_match[] __read_mostly = {
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV4,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV6,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_cluster_mt_init(void)
{
- return xt_register_match(&xt_cluster_match);
+ return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
static void __exit xt_cluster_mt_fini(void)
{
- xt_unregister_match(&xt_cluster_match);
+ xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 93cb018c3055..2aabdcea8707 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family);
- if (ret < 0)
+ if (ret < 0) {
pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
par->family);
+ return ret;
+ }
/*
* This filter cannot function correctly unless connection tracking
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 46fcac75f726..848287ab79cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = xt_net(par);
const struct xt_connlimit_info *info = par->matchinfo;
- struct nf_conntrack_tuple tuple;
- const struct nf_conntrack_tuple *tuple_ptr = &tuple;
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
@@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
u32 key[5];
ct = nf_ct_get(skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if (ct)
zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- xt_family(par), net, &tuple)) {
- goto hotdrop;
- }
if (xt_family(par) == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -62,17 +55,16 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
key[4] = zone->id;
} else {
const struct iphdr *iph = ip_hdr(skb);
- key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
- iph->daddr : iph->saddr;
- key[0] &= info->mask.ip;
+ key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
+ (__force __u32)iph->daddr : (__force __u32)iph->saddr;
+ key[0] &= (__force __u32)info->mask.ip;
key[1] = zone->id;
}
- connections = nf_conncount_count(net, info->data, key, tuple_ptr,
- zone);
+ connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key);
if (connections == 0)
- /* kmalloc failed, drop it entirely */
+ /* kmalloc failed or tuple couldn't be found, drop it entirely */
goto hotdrop;
return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
@@ -86,6 +78,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
{
struct xt_connlimit_info *info = par->matchinfo;
unsigned int keylen;
+ int ret;
keylen = sizeof(u32);
if (par->family == NFPROTO_IPV6)
@@ -93,8 +86,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
else
keylen += sizeof(struct in_addr);
+ ret = nf_ct_netns_get(par->net, par->family);
+ if (ret < 0) {
+ pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+
/* init private data */
- info->data = nf_conncount_init(par->net, par->family, keylen);
+ info->data = nf_conncount_init(par->net, keylen);
+ if (IS_ERR(info->data))
+ nf_ct_netns_put(par->net, par->family);
return PTR_ERR_OR_ZERO(info->data);
}
@@ -103,29 +105,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_connlimit_info *info = par->matchinfo;
- nf_conncount_destroy(par->net, par->family, info->data);
+ nf_conncount_destroy(par->net, info->data);
+ nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match connlimit_mt_reg __read_mostly = {
- .name = "connlimit",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = connlimit_mt_check,
- .match = connlimit_mt,
- .matchsize = sizeof(struct xt_connlimit_info),
- .usersize = offsetof(struct xt_connlimit_info, data),
- .destroy = connlimit_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
static void __exit connlimit_mt_exit(void)
{
- xt_unregister_match(&connlimit_mt_reg);
+ xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index eec2f3a88d73..4277084de2e7 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -2,7 +2,7 @@
/*
* xt_connmark - Netfilter module to operate on connection marks
*
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
* Jan Engelhardt <jengelh@medozas.de>
@@ -30,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
+ u_int32_t oldmark;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
@@ -37,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
switch (info->mode) {
case XT_CONNMARK_SET:
- newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+ oldmark = READ_ONCE(ct->mark);
+ newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
if (info->shift_dir == D_SHIFT_RIGHT)
newmark >>= info->shift_bits;
else
newmark <<= info->shift_bits;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -55,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
else
new_targetmark <<= info->shift_bits;
- newmark = (ct->mark & ~info->ctmask) ^
+ newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^
new_targetmark;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
- new_targetmark = (ct->mark & info->ctmask);
+ new_targetmark = (READ_ONCE(ct->mark) & info->ctmask);
if (info->shift_dir == D_SHIFT_RIGHT)
new_targetmark >>= info->shift_bits;
else
@@ -126,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL)
return false;
- return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+ return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert;
}
static int connmark_mt_check(const struct xt_mtchk_param *par)
@@ -149,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 1,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg,
.targetsize = sizeof(struct xt_connmark_tginfo1),
@@ -159,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 2,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg_v2,
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg,
+ .targetsize = sizeof(struct xt_connmark_tginfo1),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CONNMARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg_v2,
+ .targetsize = sizeof(struct xt_connmark_tginfo2),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match connmark_mt_reg __read_mostly = {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9c5cfd74a0ee..3b507694e81e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -15,7 +15,6 @@
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/list.h>
@@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
if (size < 16)
size = 16;
}
- /* FIXME: don't use vmalloc() here or anywhere else -HW */
- hinfo = vmalloc(struct_size(hinfo, hash, size));
+ hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL);
if (hinfo == NULL)
return -ENOMEM;
*out_hinfo = hinfo;
@@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
/* copy match config into hashtable config */
ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
if (ret) {
- vfree(hinfo);
+ kvfree(hinfo);
return ret;
}
@@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
hinfo->rnd_initialized = false;
hinfo->name = kstrdup(name, GFP_KERNEL);
if (!hinfo->name) {
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
spin_lock_init(&hinfo->lock);
@@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
ops, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
hinfo->net = net;
@@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select
unsigned int i;
for (i = 0; i < ht->cfg.size; i++) {
+ struct hlist_head *head = &ht->hash[i];
struct dsthash_ent *dh;
struct hlist_node *n;
+ if (hlist_empty(head))
+ continue;
+
spin_lock_bh(&ht->lock);
- hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
+ hlist_for_each_entry_safe(dh, n, head, node) {
if (time_after_eq(jiffies, dh->expires) || select_all)
dsthash_free(ht, dh);
}
@@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
cancel_delayed_work_sync(&hinfo->gc_work);
htable_selective_cleanup(hinfo, true);
kfree(hinfo->name);
- vfree(hinfo);
+ kvfree(hinfo);
}
}
@@ -1052,7 +1054,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket;
spin_lock_bh(&htable->lock);
@@ -1069,7 +1071,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
*pos = ++(*bucket);
@@ -1083,7 +1085,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
if (!IS_ERR(bucket))
@@ -1125,7 +1127,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1140,7 +1142,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1155,7 +1157,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1169,7 +1171,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_show_v2(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
@@ -1183,7 +1185,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
static int dl_seq_show_v1(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
@@ -1197,7 +1199,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
static int dl_seq_show(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 1873da3a945a..ca730cedb5d4 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -21,7 +21,7 @@ static bool
length_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
+ u32 pktlen = skb_ip_totlen(skb);
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
@@ -30,8 +30,7 @@ static bool
length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
- sizeof(struct ipv6hdr);
+ u32 pktlen = skb->len;
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index bd1dea9c7b88..8b4fd27857f2 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -8,16 +8,14 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_limit.h>
struct xt_limit_priv {
- spinlock_t lock;
unsigned long prev;
- uint32_t credit;
+ u32 credit;
};
MODULE_LICENSE("GPL");
@@ -66,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateinfo *r = par->matchinfo;
struct xt_limit_priv *priv = r->master;
- unsigned long now = jiffies;
-
- spin_lock_bh(&priv->lock);
- priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
- if (priv->credit > r->credit_cap)
- priv->credit = r->credit_cap;
-
- if (priv->credit >= r->cost) {
- /* We're not limited. */
- priv->credit -= r->cost;
- spin_unlock_bh(&priv->lock);
- return true;
- }
-
- spin_unlock_bh(&priv->lock);
- return false;
+ unsigned long now;
+ u32 old_credit, new_credit, credit_increase = 0;
+ bool ret;
+
+ /* fastpath if there is nothing to update */
+ if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies))
+ return false;
+
+ do {
+ now = jiffies;
+ credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+ old_credit = READ_ONCE(priv->credit);
+ new_credit = old_credit;
+ new_credit += credit_increase;
+ if (new_credit > r->credit_cap)
+ new_credit = r->credit_cap;
+ if (new_credit >= r->cost) {
+ ret = true;
+ new_credit -= r->cost;
+ } else {
+ ret = false;
+ }
+ } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit);
+
+ return ret;
}
/* Precision saver. */
@@ -122,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
r->credit_cap = priv->credit; /* Credits full. */
r->cost = user2credits(r->avg);
}
- spin_lock_init(&priv->lock);
return 0;
}
@@ -134,7 +140,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par)
kfree(info->master);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_xt_rateinfo {
u_int32_t avg;
u_int32_t burst;
@@ -176,7 +182,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src)
};
return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
}
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
static struct xt_match limit_mt_reg __read_mostly = {
.name = "limit",
@@ -186,7 +192,7 @@ static struct xt_match limit_mt_reg __read_mostly = {
.checkentry = limit_mt_check,
.destroy = limit_mt_destroy,
.matchsize = sizeof(struct xt_rateinfo),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct compat_xt_rateinfo),
.compat_from_user = limit_mt_compat_from_user,
.compat_to_user = limit_mt_compat_to_user,
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 1ad74b5920b5..59b9d04400ca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
-static struct xt_target mark_tg_reg __read_mostly = {
- .name = "MARK",
- .revision = 2,
- .family = NFPROTO_UNSPEC,
- .target = mark_tg,
- .targetsize = sizeof(struct xt_mark_tginfo2),
- .me = THIS_MODULE,
+static struct xt_target mark_tg_reg[] __read_mostly = {
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_ARP,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match mark_mt_reg __read_mostly = {
@@ -61,12 +83,12 @@ static int __init mark_mt_init(void)
{
int ret;
- ret = xt_register_target(&mark_tg_reg);
+ ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
if (ret < 0)
return ret;
ret = xt_register_match(&mark_mt_reg);
if (ret < 0) {
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
return ret;
}
return 0;
@@ -75,7 +97,7 @@ static int __init mark_mt_init(void)
static void __exit mark_mt_exit(void)
{
xt_unregister_match(&mark_mt_reg);
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
}
module_init(mark_mt_init);
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index a8e5f6c8db7a..b4f7bbc3f3ca 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -244,3 +244,4 @@ MODULE_ALIAS("ipt_SNAT");
MODULE_ALIAS("ipt_DNAT");
MODULE_ALIAS("ip6t_SNAT");
MODULE_ALIAS("ip6t_DNAT");
+MODULE_DESCRIPTION("SNAT and DNAT targets support");
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 5aab6df74e0f..0ca1cdfc4095 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -27,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
overquota = nfnl_acct_overquota(xt_net(par), info->nfacct);
- return overquota == NFACCT_UNDERQUOTA ? false : true;
+ return overquota != NFACCT_UNDERQUOTA;
}
static int
@@ -38,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
nfacct = nfnl_acct_find_get(par->net, info->name);
if (nfacct == NULL) {
- pr_info_ratelimited("accounting object `%s' does not exists\n",
- info->name);
+ pr_info_ratelimited("accounting object `%.*s' does not exist\n",
+ NFACCT_NAME_MAX, info->name);
return -ENOENT;
}
info->nfacct = nfacct;
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index e1990baf3a3b..dc9485854002 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -71,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
MODULE_DESCRIPTION("Passive OS fingerprint matching.");
MODULE_ALIAS("ipt_osf");
MODULE_ALIAS("ip6t_osf");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index e85ce69924ae..50332888c8d2 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -76,18 +76,23 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = sk->sk_socket->file;
- if (filp == NULL)
+ read_lock_bh(&sk->sk_callback_lock);
+ filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+ if (filp == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
+ }
if (info->match & XT_OWNER_UID) {
kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
uid_lte(filp->f_cred->fsuid, uid_max)) ^
- !(info->invert & XT_OWNER_UID))
+ !(info->invert & XT_OWNER_UID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
if (info->match & XT_OWNER_GID) {
@@ -112,10 +117,13 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
}
- if (match ^ !(info->invert & XT_OWNER_GID))
+ if (match ^ !(info->invert & XT_OWNER_GID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
+ read_unlock_bh(&sk->sk_callback_lock);
return true;
}
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index ec6ed6fda96c..343e65f377d4 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
(!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
return false;
- physdev = nf_bridge_get_physindev(skb);
+ physdev = nf_bridge_get_physindev(skb, xt_net(par));
indev = physdev ? physdev->name : NULL;
if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 19bef176145e..588a5e6ad899 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -59,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil
/* retained for backwards compatibility */
static unsigned int ip_pkt_list_tot __read_mostly;
module_param(ip_pkt_list_tot, uint, 0400);
-MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)");
-#define XT_RECENT_MAX_NSTAMPS 256
+#define XT_RECENT_MAX_NSTAMPS 65536
struct recent_entry {
struct list_head list;
@@ -69,7 +69,7 @@ struct recent_entry {
union nf_inet_addr addr;
u_int16_t family;
u_int8_t ttl;
- u_int8_t index;
+ u_int16_t index;
u_int16_t nstamps;
unsigned long stamps[];
};
@@ -80,7 +80,7 @@ struct recent_table {
union nf_inet_addr mask;
unsigned int refcnt;
unsigned int entries;
- u8 nstamps_max_mask;
+ u_int16_t nstamps_max_mask;
struct list_head lru_list;
struct list_head iphash[];
};
@@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
/*
* Drop entries with timestamps older then 'time'.
*/
-static void recent_entry_reap(struct recent_table *t, unsigned long time)
+static void recent_entry_reap(struct recent_table *t, unsigned long time,
+ struct recent_entry *working, bool update)
{
struct recent_entry *e;
@@ -162,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time)
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
/*
+ * Do not reap the entry which are going to be updated.
+ */
+ if (e == working && update)
+ return;
+
+ /*
* The last time stamp is the most recent.
*/
if (time_after(time, e->stamps[e->index-1]))
@@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* info->seconds must be non-zero */
if (info->check_set & XT_RECENT_REAP)
- recent_entry_reap(t, time);
+ recent_entry_reap(t, time, e,
+ info->check_set & XT_RECENT_UPDATE && ret);
}
if (info->check_set & XT_RECENT_SET ||
@@ -543,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file)
if (st == NULL)
return -ENOMEM;
- st->table = PDE_DATA(inode);
+ st->table = pde_data(inode);
return 0;
}
@@ -551,9 +559,9 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- struct recent_table *t = PDE_DATA(file_inode(file));
+ struct recent_table *t = pde_data(file_inode(file));
struct recent_entry *e;
- char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+ char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")];
const char *c = buf;
union nf_inet_addr addr = {};
u_int16_t family;
@@ -640,7 +648,7 @@ static void __net_exit recent_proc_net_exit(struct net *net)
struct recent_table *t;
/* recent_net_exit() is called before recent_mt_destroy(). Make sure
- * that the parent xt_recent proc entry is is empty before trying to
+ * that the parent xt_recent proc entry is empty before trying to
* remove it.
*/
spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
index 68ccbe50bb1e..600060ca940a 100644
--- a/net/netfilter/xt_repldata.h
+++ b/net/netfilter/xt_repldata.h
@@ -29,7 +29,7 @@
if (tbl == NULL) \
return NULL; \
term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
- strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+ strscpy(tbl->repl.name, info->name); \
*term = (struct type##_error)typ2##_ERROR_INIT; \
tbl->repl.valid_hooks = hook_mask; \
tbl->repl.num_entries = nhooks + 1; \
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 680015ba7cb6..b46a6a512058 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -4,7 +4,6 @@
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <net/sctp/sctp.h>
#include <linux/sctp.h>
#include <linux/netfilter/x_tables.h>
@@ -150,6 +149,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par)
{
const struct xt_sctp_info *info = par->matchinfo;
+ if (info->flag_count > ARRAY_SIZE(info->flag_info))
+ return -EINVAL;
if (info->flags & ~XT_SCTP_VALID_FLAGS)
return -EINVAL;
if (info->invflags & ~XT_SCTP_VALID_FLAGS)
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 5f973987265d..76e01f292aaf 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -216,6 +216,16 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
return 0;
}
+static void socket_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ if (par->family == NFPROTO_IPV4)
+ nf_defrag_ipv4_disable(par->net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ else if (par->family == NFPROTO_IPV6)
+ nf_defrag_ipv6_disable(par->net);
+#endif
+}
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -231,6 +241,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.revision = 1,
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
+ .destroy = socket_mt_destroy,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -245,6 +256,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
+ .destroy = socket_mt_destroy,
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
@@ -256,6 +268,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -268,6 +281,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -280,6 +294,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -292,6 +307,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 203e24ae472c..b26c1dcfc27b 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 11ec2abf0c72..e8991130a3de 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -4,6 +4,7 @@
#include <linux/module.h>
#include <net/ip.h>
#include <linux/ipv6.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/udp.h>
@@ -20,6 +21,8 @@ MODULE_ALIAS("ipt_udp");
MODULE_ALIAS("ipt_tcp");
MODULE_ALIAS("ip6t_udp");
MODULE_ALIAS("ip6t_tcp");
+MODULE_ALIAS("ipt_icmp");
+MODULE_ALIAS("ip6t_icmp6");
/* Returns 1 if the port is matched by the range, 0 otherwise */
static inline bool
@@ -161,6 +164,95 @@ static int udp_mt_check(const struct xt_mtchk_param *par)
return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0;
}
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code)
+{
+ return type == test_type && code >= min_code && code <= max_code;
+}
+
+static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return (test_type == 0xFF ||
+ type_code_in_range(test_type, min_code, max_code, type, code))
+ ^ invert;
+}
+
+static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert;
+}
+
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmphdr *ic;
+ struct icmphdr _icmph;
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags & IPT_ICMP_INV));
+}
+
+static bool
+icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmp6hdr *ic;
+ struct icmp6hdr _icmph;
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp6_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->icmp6_type, ic->icmp6_code,
+ !!(icmpinfo->invflags & IP6T_ICMP_INV));
+}
+
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+}
+
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+}
+
static struct xt_match tcpudp_mt_reg[] __read_mostly = {
{
.name = "tcp",
@@ -216,6 +308,24 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
.proto = IPPROTO_UDPLITE,
.me = THIS_MODULE,
},
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "icmp6",
+ .match = icmp6_match,
+ .matchsize = sizeof(struct ip6t_icmp),
+ .checkentry = icmp6_checkentry,
+ .proto = IPPROTO_ICMPV6,
+ .family = NFPROTO_IPV6,
+ .me = THIS_MODULE,
+ },
};
static int __init tcpudp_mt_init(void)
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 67cb98489415..6aa12d0f54e2 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -5,7 +5,7 @@
* based on ipt_time by Fabrice MARIE <fabrice@netfilter.org>
* This is a module which is used for time matching
* It is using some modified code from dietlibc (localtime() function)
- * that you can find at http://www.fefe.de/dietlibc/
+ * that you can find at https://www.fefe.de/dietlibc/
* This file is distributed under the terms of the GNU General Public
* License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
*/
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index 177b40d08098..117d4615d668 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -96,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret ^ data->invert;
}
+static int u32_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_u32 *data = par->matchinfo;
+ const struct xt_u32_test *ct;
+ unsigned int i;
+
+ if (data->ntests > ARRAY_SIZE(data->tests))
+ return -EINVAL;
+
+ for (i = 0; i < data->ntests; ++i) {
+ ct = &data->tests[i];
+
+ if (ct->nnums > ARRAY_SIZE(ct->location) ||
+ ct->nvalues > ARRAY_SIZE(ct->value))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct xt_match xt_u32_mt_reg __read_mostly = {
.name = "u32",
.revision = 0,
.family = NFPROTO_UNSPEC,
.match = u32_mt,
+ .checkentry = u32_mt_checkentry,
.matchsize = sizeof(struct xt_u32),
.me = THIS_MODULE,
};