Diffstat (limited to 'net/netfilter')
-rw-r--r--  net/netfilter/Kconfig | 293
-rw-r--r--  net/netfilter/Makefile | 70
-rw-r--r--  net/netfilter/core.c | 294
-rw-r--r--  net/netfilter/ipset/Kconfig | 3
-rw-r--r--  net/netfilter/ipset/ip_set_bitmap_gen.h | 46
-rw-r--r--  net/netfilter/ipset/ip_set_bitmap_ip.c | 56
-rw-r--r--  net/netfilter/ipset/ip_set_bitmap_ipmac.c | 38
-rw-r--r--  net/netfilter/ipset/ip_set_bitmap_port.c | 61
-rw-r--r--  net/netfilter/ipset/ip_set_core.c | 800
-rw-r--r--  net/netfilter/ipset/ip_set_getport.c | 31
-rw-r--r--  net/netfilter/ipset/ip_set_hash_gen.h | 859
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ip.c | 63
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ipmac.c | 27
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ipmark.c | 41
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ipport.c | 57
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ipportip.c | 35
-rw-r--r--  net/netfilter/ipset/ip_set_hash_ipportnet.c | 51
-rw-r--r--  net/netfilter/ipset/ip_set_hash_mac.c | 22
-rw-r--r--  net/netfilter/ipset/ip_set_hash_net.c | 52
-rw-r--r--  net/netfilter/ipset/ip_set_hash_netiface.c | 91
-rw-r--r--  net/netfilter/ipset/ip_set_hash_netnet.c | 76
-rw-r--r--  net/netfilter/ipset/ip_set_hash_netport.c | 51
-rw-r--r--  net/netfilter/ipset/ip_set_hash_netportnet.c | 70
-rw-r--r--  net/netfilter/ipset/ip_set_list_set.c | 83
-rw-r--r--  net/netfilter/ipset/pfxlen.c | 1
-rw-r--r--  net/netfilter/ipvs/Kconfig | 107
-rw-r--r--  net/netfilter/ipvs/Makefile | 1
-rw-r--r--  net/netfilter/ipvs/ip_vs_app.c | 36
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c | 247
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c | 650
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c | 998
-rw-r--r--  net/netfilter/ipvs/ip_vs_dh.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_est.c | 893
-rw-r--r--  net/netfilter/ipvs/ip_vs_fo.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_ftp.c | 28
-rw-r--r--  net/netfilter/ipvs/ip_vs_lblc.c | 25
-rw-r--r--  net/netfilter/ipvs/ip_vs_lblcr.c | 29
-rw-r--r--  net/netfilter/ipvs/ip_vs_lc.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_mh.c | 8
-rw-r--r--  net/netfilter/ipvs/ip_vs_nfct.c | 23
-rw-r--r--  net/netfilter/ipvs/ip_vs_nq.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_ovf.c | 13
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe.c | 7
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe_sip.c | 5
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto.c | 14
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_sctp.c | 25
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_tcp.c | 50
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_udp.c | 39
-rw-r--r--  net/netfilter/ipvs/ip_vs_rr.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_sched.c | 13
-rw-r--r--  net/netfilter/ipvs/ip_vs_sed.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_sh.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c | 216
-rw-r--r--  net/netfilter/ipvs/ip_vs_twos.c | 139
-rw-r--r--  net/netfilter/ipvs/ip_vs_wlc.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_wrr.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_xmit.c | 425
-rw-r--r--  net/netfilter/nf_bpf_link.c | 332
-rw-r--r--  net/netfilter/nf_conncount.c | 246
-rw-r--r--  net/netfilter/nf_conntrack_acct.c | 26
-rw-r--r--  net/netfilter/nf_conntrack_amanda.c | 27
-rw-r--r--  net/netfilter/nf_conntrack_bpf.c | 550
-rw-r--r--  net/netfilter/nf_conntrack_broadcast.c | 26
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 1821
-rw-r--r--  net/netfilter/nf_conntrack_ecache.c | 415
-rw-r--r--  net/netfilter/nf_conntrack_expect.c | 106
-rw-r--r--  net/netfilter/nf_conntrack_extend.c | 168
-rw-r--r--  net/netfilter/nf_conntrack_ftp.c | 48
-rw-r--r--  net/netfilter/nf_conntrack_h323_asn1.c | 21
-rw-r--r--  net/netfilter/nf_conntrack_h323_main.c | 313
-rw-r--r--  net/netfilter/nf_conntrack_h323_types.c | 3
-rw-r--r--  net/netfilter/nf_conntrack_helper.c | 197
-rw-r--r--  net/netfilter/nf_conntrack_irc.c | 70
-rw-r--r--  net/netfilter/nf_conntrack_labels.c | 41
-rw-r--r--  net/netfilter/nf_conntrack_netbios_ns.c | 11
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 1568
-rw-r--r--  net/netfilter/nf_conntrack_ovs.c | 185
-rw-r--r--  net/netfilter/nf_conntrack_pptp.c | 135
-rw-r--r--  net/netfilter/nf_conntrack_proto.c | 743
-rw-r--r--  net/netfilter/nf_conntrack_proto_dccp.c | 867
-rw-r--r--  net/netfilter/nf_conntrack_proto_generic.c | 90
-rw-r--r--  net/netfilter/nf_conntrack_proto_gre.c | 212
-rw-r--r--  net/netfilter/nf_conntrack_proto_icmp.c | 212
-rw-r--r--  net/netfilter/nf_conntrack_proto_icmpv6.c | 207
-rw-r--r--  net/netfilter/nf_conntrack_proto_sctp.c | 434
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 856
-rw-r--r--  net/netfilter/nf_conntrack_proto_udp.c | 120
-rw-r--r--  net/netfilter/nf_conntrack_sane.c | 84
-rw-r--r--  net/netfilter/nf_conntrack_seqadj.c | 21
-rw-r--r--  net/netfilter/nf_conntrack_sip.c | 101
-rw-r--r--  net/netfilter/nf_conntrack_snmp.c | 6
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 654
-rw-r--r--  net/netfilter/nf_conntrack_tftp.c | 24
-rw-r--r--  net/netfilter/nf_conntrack_timeout.c | 126
-rw-r--r--  net/netfilter/nf_conntrack_timestamp.c | 25
-rw-r--r--  net/netfilter/nf_dup_netdev.c | 67
-rw-r--r--  net/netfilter/nf_flow_table_bpf.c | 121
-rw-r--r--  net/netfilter/nf_flow_table_core.c | 687
-rw-r--r--  net/netfilter/nf_flow_table_inet.c | 74
-rw-r--r--  net/netfilter/nf_flow_table_ip.c | 900
-rw-r--r--  net/netfilter/nf_flow_table_offload.c | 1241
-rw-r--r--  net/netfilter/nf_flow_table_path.c | 330
-rw-r--r--  net/netfilter/nf_flow_table_procfs.c | 80
-rw-r--r--  net/netfilter/nf_flow_table_xdp.c | 147
-rw-r--r--  net/netfilter/nf_hooks_lwtunnel.c | 123
-rw-r--r--  net/netfilter/nf_internals.h | 26
-rw-r--r--  net/netfilter/nf_log.c | 75
-rw-r--r--  net/netfilter/nf_log_common.c | 215
-rw-r--r--  net/netfilter/nf_log_netdev.c | 81
-rw-r--r--  net/netfilter/nf_log_syslog.c | 1085
-rw-r--r--  net/netfilter/nf_nat_amanda.c | 29
-rw-r--r--  net/netfilter/nf_nat_bpf.c | 77
-rw-r--r--  net/netfilter/nf_nat_core.c | 627
-rw-r--r--  net/netfilter/nf_nat_ftp.c | 31
-rw-r--r--  net/netfilter/nf_nat_helper.c | 55
-rw-r--r--  net/netfilter/nf_nat_irc.c | 31
-rw-r--r--  net/netfilter/nf_nat_masquerade.c | 368
-rw-r--r--  net/netfilter/nf_nat_ovs.c | 136
-rw-r--r--  net/netfilter/nf_nat_proto.c | 948
-rw-r--r--  net/netfilter/nf_nat_redirect.c | 113
-rw-r--r--  net/netfilter/nf_nat_sip.c | 36
-rw-r--r--  net/netfilter/nf_nat_tftp.c | 16
-rw-r--r--  net/netfilter/nf_queue.c | 259
-rw-r--r--  net/netfilter/nf_sockopt.c | 60
-rw-r--r--  net/netfilter/nf_synproxy_core.c | 940
-rw-r--r--  net/netfilter/nf_tables_api.c | 9391
-rw-r--r--  net/netfilter/nf_tables_core.c | 245
-rw-r--r--  net/netfilter/nf_tables_offload.c | 699
-rw-r--r--  net/netfilter/nf_tables_set_core.c | 28
-rw-r--r--  net/netfilter/nf_tables_trace.c | 189
-rw-r--r--  net/netfilter/nfnetlink.c | 303
-rw-r--r--  net/netfilter/nfnetlink_acct.c | 146
-rw-r--r--  net/netfilter/nfnetlink_cthelper.c | 139
-rw-r--r--  net/netfilter/nfnetlink_cttimeout.c | 341
-rw-r--r--  net/netfilter/nfnetlink_hook.c | 486
-rw-r--r--  net/netfilter/nfnetlink_log.c | 195
-rw-r--r--  net/netfilter/nfnetlink_osf.c | 71
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 482
-rw-r--r--  net/netfilter/nft_bitwise.c | 616
-rw-r--r--  net/netfilter/nft_byteorder.c | 72
-rw-r--r--  net/netfilter/nft_chain_filter.c | 197
-rw-r--r--  net/netfilter/nft_chain_nat.c | 149
-rw-r--r--  net/netfilter/nft_chain_route.c | 169
-rw-r--r--  net/netfilter/nft_cmp.c | 286
-rw-r--r--  net/netfilter/nft_compat.c | 396
-rw-r--r--  net/netfilter/nft_connlimit.c | 89
-rw-r--r--  net/netfilter/nft_counter.c | 178
-rw-r--r--  net/netfilter/nft_ct.c | 382
-rw-r--r--  net/netfilter/nft_ct_fast.c | 62
-rw-r--r--  net/netfilter/nft_dup_netdev.c | 34
-rw-r--r--  net/netfilter/nft_dynset.c | 319
-rw-r--r--  net/netfilter/nft_exthdr.c | 522
-rw-r--r--  net/netfilter/nft_fib.c | 92
-rw-r--r--  net/netfilter/nft_fib_inet.c | 8
-rw-r--r--  net/netfilter/nft_fib_netdev.c | 10
-rw-r--r--  net/netfilter/nft_flow_offload.c | 150
-rw-r--r--  net/netfilter/nft_fwd_netdev.c | 67
-rw-r--r--  net/netfilter/nft_hash.c | 178
-rw-r--r--  net/netfilter/nft_immediate.c | 246
-rw-r--r--  net/netfilter/nft_inner.c | 431
-rw-r--r--  net/netfilter/nft_last.c | 138
-rw-r--r--  net/netfilter/nft_limit.c | 225
-rw-r--r--  net/netfilter/nft_log.c | 34
-rw-r--r--  net/netfilter/nft_lookup.c | 216
-rw-r--r--  net/netfilter/nft_masq.c | 248
-rw-r--r--  net/netfilter/nft_meta.c | 721
-rw-r--r--  net/netfilter/nft_nat.c | 242
-rw-r--r--  net/netfilter/nft_numgen.c | 92
-rw-r--r--  net/netfilter/nft_objref.c | 150
-rw-r--r--  net/netfilter/nft_osf.c | 102
-rw-r--r--  net/netfilter/nft_payload.c | 792
-rw-r--r--  net/netfilter/nft_queue.c | 52
-rw-r--r--  net/netfilter/nft_quota.c | 115
-rw-r--r--  net/netfilter/nft_range.c | 39
-rw-r--r--  net/netfilter/nft_redir.c | 227
-rw-r--r--  net/netfilter/nft_reject.c | 29
-rw-r--r--  net/netfilter/nft_reject_inet.c | 80
-rw-r--r--  net/netfilter/nft_reject_netdev.c | 190
-rw-r--r--  net/netfilter/nft_rt.c | 35
-rw-r--r--  net/netfilter/nft_set_bitmap.c | 105
-rw-r--r--  net/netfilter/nft_set_hash.c | 514
-rw-r--r--  net/netfilter/nft_set_pipapo.c | 2400
-rw-r--r--  net/netfilter/nft_set_pipapo.h | 302
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.c | 1302
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.h | 16
-rw-r--r--  net/netfilter/nft_set_rbtree.c | 670
-rw-r--r--  net/netfilter/nft_socket.c | 221
-rw-r--r--  net/netfilter/nft_synproxy.c | 397
-rw-r--r--  net/netfilter/nft_tproxy.c | 78
-rw-r--r--  net/netfilter/nft_tunnel.c | 313
-rw-r--r--  net/netfilter/nft_xfrm.c | 64
-rw-r--r--  net/netfilter/utils.c | 81
-rw-r--r--  net/netfilter/x_tables.c | 312
-rw-r--r--  net/netfilter/xt_AUDIT.c | 7
-rw-r--r--  net/netfilter/xt_CHECKSUM.c | 38
-rw-r--r--  net/netfilter/xt_CLASSIFY.c | 21
-rw-r--r--  net/netfilter/xt_CONNSECMARK.c | 44
-rw-r--r--  net/netfilter/xt_CT.c | 244
-rw-r--r--  net/netfilter/xt_DSCP.c | 21
-rw-r--r--  net/netfilter/xt_HL.c | 9
-rw-r--r--  net/netfilter/xt_HMARK.c | 13
-rw-r--r--  net/netfilter/xt_IDLETIMER.c | 349
-rw-r--r--  net/netfilter/xt_LED.c | 69
-rw-r--r--  net/netfilter/xt_LOG.c | 16
-rw-r--r--  net/netfilter/xt_MASQUERADE.c | 128
-rw-r--r--  net/netfilter/xt_NETMAP.c | 5
-rw-r--r--  net/netfilter/xt_NFLOG.c | 52
-rw-r--r--  net/netfilter/xt_NFQUEUE.c | 6
-rw-r--r--  net/netfilter/xt_RATEEST.c | 58
-rw-r--r--  net/netfilter/xt_REDIRECT.c | 15
-rw-r--r--  net/netfilter/xt_SECMARK.c | 119
-rw-r--r--  net/netfilter/xt_TCPMSS.c | 11
-rw-r--r--  net/netfilter/xt_TCPOPTSTRIP.c | 37
-rw-r--r--  net/netfilter/xt_TEE.c | 5
-rw-r--r--  net/netfilter/xt_TPROXY.c | 44
-rw-r--r--  net/netfilter/xt_TRACE.c | 38
-rw-r--r--  net/netfilter/xt_addrtype.c | 36
-rw-r--r--  net/netfilter/xt_bpf.c | 7
-rw-r--r--  net/netfilter/xt_cgroup.c | 31
-rw-r--r--  net/netfilter/xt_cluster.c | 38
-rw-r--r--  net/netfilter/xt_comment.c | 1
-rw-r--r--  net/netfilter/xt_connbytes.c | 4
-rw-r--r--  net/netfilter/xt_connlabel.c | 7
-rw-r--r--  net/netfilter/xt_connlimit.c | 76
-rw-r--r--  net/netfilter/xt_connmark.c | 62
-rw-r--r--  net/netfilter/xt_conntrack.c | 5
-rw-r--r--  net/netfilter/xt_cpu.c | 6
-rw-r--r--  net/netfilter/xt_dccp.c | 5
-rw-r--r--  net/netfilter/xt_devgroup.c | 5
-rw-r--r--  net/netfilter/xt_dscp.c | 5
-rw-r--r--  net/netfilter/xt_ecn.c | 5
-rw-r--r--  net/netfilter/xt_esp.c | 5
-rw-r--r--  net/netfilter/xt_hashlimit.c | 103
-rw-r--r--  net/netfilter/xt_helper.c | 5
-rw-r--r--  net/netfilter/xt_hl.c | 5
-rw-r--r--  net/netfilter/xt_ipcomp.c | 6
-rw-r--r--  net/netfilter/xt_iprange.c | 9
-rw-r--r--  net/netfilter/xt_ipvs.c | 1
-rw-r--r--  net/netfilter/xt_l2tp.c | 5
-rw-r--r--  net/netfilter/xt_length.c | 10
-rw-r--r--  net/netfilter/xt_limit.c | 57
-rw-r--r--  net/netfilter/xt_mac.c | 5
-rw-r--r--  net/netfilter/xt_mark.c | 47
-rw-r--r--  net/netfilter/xt_multiport.c | 5
-rw-r--r--  net/netfilter/xt_nat.c | 8
-rw-r--r--  net/netfilter/xt_nfacct.c | 49
-rw-r--r--  net/netfilter/xt_osf.c | 16
-rw-r--r--  net/netfilter/xt_owner.c | 45
-rw-r--r--  net/netfilter/xt_physdev.c | 27
-rw-r--r--  net/netfilter/xt_pkttype.c | 5
-rw-r--r--  net/netfilter/xt_policy.c | 5
-rw-r--r--  net/netfilter/xt_quota.c | 1
-rw-r--r--  net/netfilter/xt_rateest.c | 5
-rw-r--r--  net/netfilter/xt_realm.c | 5
-rw-r--r--  net/netfilter/xt_recent.c | 60
-rw-r--r--  net/netfilter/xt_repldata.h | 2
-rw-r--r--  net/netfilter/xt_sctp.c | 4
-rw-r--r--  net/netfilter/xt_set.c | 51
-rw-r--r--  net/netfilter/xt_socket.c | 26
-rw-r--r--  net/netfilter/xt_state.c | 5
-rw-r--r--  net/netfilter/xt_statistic.c | 7
-rw-r--r--  net/netfilter/xt_string.c | 5
-rw-r--r--  net/netfilter/xt_tcpmss.c | 5
-rw-r--r--  net/netfilter/xt_tcpudp.c | 111
-rw-r--r--  net/netfilter/xt_time.c | 42
-rw-r--r--  net/netfilter/xt_u32.c | 22
267 files changed, 42492 insertions, 15230 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index beb3a69ce1d4..6cdc994fdc8a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,5 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "Core Netfilter Configuration"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
config NETFILTER_INGRESS
bool "Netfilter ingress support"
@@ -9,6 +10,17 @@ config NETFILTER_INGRESS
This allows you to classify packets from ingress using the Netfilter
infrastructure.
+config NETFILTER_EGRESS
+ bool "Netfilter egress support"
+ default y
+ select NET_EGRESS
+ help
+ This allows you to classify packets before transmission using the
+ Netfilter infrastructure.
+
+config NETFILTER_SKIP_EGRESS
+ def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB)
+
config NETFILTER_NETLINK
tristate
@@ -18,8 +30,21 @@ config NETFILTER_FAMILY_BRIDGE
config NETFILTER_FAMILY_ARP
bool
+config NETFILTER_BPF_LINK
+ def_bool BPF_SYSCALL
+
+config NETFILTER_NETLINK_HOOK
+ tristate "Netfilter base hook dump support"
+ depends on NETFILTER_ADVANCED
+ depends on NF_TABLES
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ to list the base netfilter hooks via NFNETLINK.
+ This is helpful for debugging.
+
config NETFILTER_NETLINK_ACCT
-tristate "Netfilter NFACCT over NFNETLINK interface"
+ tristate "Netfilter NFACCT over NFNETLINK interface"
depends on NETFILTER_ADVANCED
select NETFILTER_NETLINK
help
@@ -33,7 +58,7 @@ config NETFILTER_NETLINK_QUEUE
help
If this option is enabled, the kernel will include support
for queueing packets via NFNETLINK.
-
+
config NETFILTER_NETLINK_LOG
tristate "Netfilter LOG over NFNETLINK interface"
default m if NETFILTER_ADVANCED=n
@@ -70,12 +95,17 @@ config NF_CONNTRACK
To compile it as a module, choose M here. If unsure, say N.
-config NF_LOG_COMMON
- tristate
-
-config NF_LOG_NETDEV
- tristate "Netdev packet logging"
- select NF_LOG_COMMON
+config NF_LOG_SYSLOG
+ tristate "Syslog packet logging"
+ default m if NETFILTER_ADVANCED=n
+ help
+ This option enable support for packet logging via syslog.
+ It supports IPv4, IPV6, ARP and common transport protocols such
+ as TCP and UDP.
+ This is a simpler but less flexible logging method compared to
+ CONFIG_NETFILTER_NETLINK_LOG.
+ If both are enabled the backend to use can be configured at run-time
+ by means of per-address-family sysctl tunables.
if NF_CONNTRACK
config NETFILTER_CONNCOUNT
@@ -93,7 +123,7 @@ config NF_CONNTRACK_MARK
config NF_CONNTRACK_SECMARK
bool 'Connection tracking security mark support'
depends on NETWORK_SECMARK
- default m if NETFILTER_ADVANCED=n
+ default y if NETFILTER_ADVANCED=n
help
This option enables security markings to be applied to
connections. Typically they are copied to connections from
@@ -117,9 +147,8 @@ config NF_CONNTRACK_ZONES
config NF_CONNTRACK_PROCFS
bool "Supply CT list in procfs (OBSOLETE)"
- default y
depends on PROC_FS
- ---help---
+ help
This option enables for the list of known conntrack entries
to be shown in procfs under net/netfilter/nf_conntrack. This
is considered obsolete in favor of using the conntrack(8)
@@ -163,24 +192,17 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It can be used with xtables connlabel
match and the nftables ct expression.
-config NF_CT_PROTO_DCCP
- bool 'DCCP protocol connection tracking support'
- depends on NETFILTER_ADVANCED
- default y
- help
- With this option enabled, the layer 3 independent connection
- tracking code will be able to do state tracking on DCCP connections.
-
- If unsure, say Y.
+config NF_CONNTRACK_OVS
+ bool
config NF_CT_PROTO_GRE
- tristate
+ bool
config NF_CT_PROTO_SCTP
bool 'SCTP protocol connection tracking support'
depends on NETFILTER_ADVANCED
default y
- select LIBCRC32C
+ select NET_CRC32C
help
With this option enabled, the layer 3 independent connection
tracking code will be able to do state tracking on SCTP connections.
@@ -222,8 +244,6 @@ config NF_CONNTRACK_FTP
of Network Address Translation on them.
This is FTP support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
To compile it as a module, choose M here. If unsure, say N.
@@ -337,7 +357,7 @@ config NF_CONNTRACK_SIP
help
SIP is an application-layer control protocol that can establish,
modify, and terminate multimedia sessions (conferences) such as
- Internet telephony calls. With the ip_conntrack_sip and
+ Internet telephony calls. With the nf_conntrack_sip and
the nf_nat_sip modules you can support the protocol on a connection
tracking/NATing firewall.
@@ -396,12 +416,13 @@ config NETFILTER_NETLINK_GLUE_CT
the enqueued via NFNETLINK.
config NF_NAT
- tristate
-
-config NF_NAT_NEEDED
- bool
- depends on NF_NAT
- default y
+ tristate "Network Address Translation support"
+ depends on NF_CONNTRACK
+ default m if NETFILTER_ADVANCED=n
+ help
+ The NAT option allows masquerading, port forwarding and other
+ forms of full Network Address Port Translation. This can be
+ controlled by iptables, ip6tables or nft.
config NF_NAT_AMANDA
tristate
@@ -431,6 +452,12 @@ config NF_NAT_TFTP
config NF_NAT_REDIRECT
bool
+config NF_NAT_MASQUERADE
+ bool
+
+config NF_NAT_OVS
+ bool
+
config NETFILTER_SYNPROXY
tristate
@@ -438,13 +465,14 @@ endif # NF_CONNTRACK
config NF_TABLES
select NETFILTER_NETLINK
+ select NET_CRC32C
tristate "Netfilter nf_tables support"
help
nftables is the new packet classification framework that intends to
replace the existing {ip,ip6,arp,eb}_tables infrastructure. It
provides a pseudo-state machine with an extensible instruction-set
(also known as expressions) that the userspace 'nft' utility
- (http://www.netfilter.org/projects/nftables) uses to build the
+ (https://www.netfilter.org/projects/nftables) uses to build the
rule-set. It also comes with the generic set infrastructure that
allows you to construct mappings between matchings and actions
for performance lookups.
@@ -452,14 +480,6 @@ config NF_TABLES
To compile it as a module, choose M here.
if NF_TABLES
-
-config NF_TABLES_SET
- tristate "Netfilter nf_tables set infrastructure"
- help
- This option enables the nf_tables set infrastructure that allows to
- look up for elements in a set and to build one-way mappings between
- matchings and actions.
-
config NF_TABLES_INET
depends on IPV6
select NF_TABLES_IPV4
@@ -486,6 +506,12 @@ config NFT_CT
This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
+config NFT_EXTHDR_DCCP
+ bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)"
+ default n
+ help
+ This option adds support for matching on DCCP extension headers.
+
config NFT_FLOW_OFFLOAD
depends on NF_CONNTRACK && NF_FLOW_TABLE
tristate "Netfilter nf_tables hardware flow offload module"
@@ -493,12 +519,6 @@ config NFT_FLOW_OFFLOAD
This option adds the "flow_offload" expression that you can use to
choose what flows are placed into the hardware.
-config NFT_COUNTER
- tristate "Netfilter nf_tables counter module"
- help
- This option adds the "counter" expression that you can use to
- include packet and byte counters in a rule.
-
config NFT_CONNLIMIT
tristate "Netfilter nf_tables connlimit module"
depends on NF_CONNTRACK
@@ -523,6 +543,7 @@ config NFT_LIMIT
config NFT_MASQ
depends on NF_CONNTRACK
depends on NF_NAT
+ select NF_NAT_MASQUERADE
tristate "Netfilter nf_tables masquerade support"
help
This option adds the "masquerade" expression that you can use
@@ -532,6 +553,7 @@ config NFT_REDIR
depends on NF_CONNTRACK
depends on NF_NAT
tristate "Netfilter nf_tables redirect support"
+ select NF_NAT_REDIRECT
help
This options adds the "redirect" expression that you can use
to perform NAT in the redirect flavour.
@@ -539,6 +561,7 @@ config NFT_REDIR
config NFT_NAT
depends on NF_CONNTRACK
select NF_NAT
+ depends on NF_TABLES_IPV4 || NF_TABLES_IPV6
tristate "Netfilter nf_tables nat module"
help
This option adds the "nat" expression that you can use to perform
@@ -550,12 +573,6 @@ config NFT_TUNNEL
This option adds the "tunnel" expression that you can use to set
tunneling policies.
-config NFT_OBJREF
- tristate "Netfilter nf_tables stateful object reference module"
- help
- This option adds the "objref" expression that allows you to refer to
- stateful objects, such as counters and quotas.
-
config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
@@ -643,6 +660,17 @@ config NFT_TPROXY
help
This makes transparent proxy support available in nftables.
+config NFT_SYNPROXY
+ tristate "Netfilter nf_tables SYNPROXY expression support"
+ depends on NF_CONNTRACK && NETFILTER_ADVANCED
+ select NETFILTER_SYNPROXY
+ select SYN_COOKIES
+ help
+ The SYNPROXY expression allows you to intercept TCP connections and
+ establish them using syncookies before they are passed on to the
+ server. This allows to avoid conntrack and server resource usage
+ during SYN-flood attacks.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -672,6 +700,16 @@ config NFT_FIB_NETDEV
The lookup will be delegated to the IPv4 or IPv6 FIB depending
on the protocol of the packet.
+config NFT_REJECT_NETDEV
+ depends on NFT_REJECT_IPV4
+ depends on NFT_REJECT_IPV6
+ tristate "Netfilter nf_tables netdev REJECT support"
+ help
+ This option enables the REJECT support from the netdev table.
+ The return packet generation will be delegated to the IPv4
+ or IPv6 ICMP or TCP RST implementation depending on the
+ protocol of the packet.
+
endif # NF_TABLES_NETDEV
endif # NF_TABLES
@@ -680,7 +718,7 @@ config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
depends on NF_FLOW_TABLE
help
- This option adds the flow table mixed IPv4/IPv6 support.
+ This option adds the flow table mixed IPv4/IPv6 support.
To compile it as a module, choose M here.
@@ -694,6 +732,14 @@ config NF_FLOW_TABLE
To compile it as a module, choose M here.
+config NF_FLOW_TABLE_PROCFS
+ bool "Supply flow table statistics in procfs"
+ depends on NF_FLOW_TABLE
+ depends on PROC_FS
+ help
+ This option enables for the flow table offload statistics
+ to be shown in procfs under net/netfilter/nf_flowtable.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
@@ -703,12 +749,31 @@ config NETFILTER_XTABLES
if NETFILTER_XTABLES
+config NETFILTER_XTABLES_COMPAT
+ bool "Netfilter Xtables 32bit support"
+ depends on COMPAT
+ help
+ This option provides a translation layer to run 32bit arp,ip(6),ebtables
+ binaries on 64bit kernels.
+
+ If unsure, say N.
+
+config NETFILTER_XTABLES_LEGACY
+ bool "Netfilter legacy tables support"
+ depends on !PREEMPT_RT
+ help
+ Say Y here if you still require support for legacy tables. This is
+ required by the legacy tools (iptables-legacy) and is not needed if
+ you use iptables over nftables (iptables-nft).
+ Legacy support is not limited to IP, it also includes EBTABLES and
+ ARPTABLES.
+
comment "Xtables combined modules"
config NETFILTER_XT_MARK
tristate 'nfmark target and match support'
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option adds the "MARK" target and "mark" match.
Netfilter mark matching allows you to match packets based on the
@@ -724,7 +789,7 @@ config NETFILTER_XT_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
- ---help---
+ help
This option adds the "CONNMARK" target and "connmark" match.
Netfilter allows you to store a mark value per connection (a.k.a.
@@ -751,7 +816,7 @@ config NETFILTER_XT_TARGET_AUDIT
tristate "AUDIT target support"
depends on AUDIT
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a 'AUDIT' target, which can be used to create
audit records for packets dropped/accepted.
@@ -759,9 +824,9 @@ config NETFILTER_XT_TARGET_AUDIT
config NETFILTER_XT_TARGET_CHECKSUM
tristate "CHECKSUM target support"
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a `CHECKSUM' target, which can be used in the iptables mangle
table to work around buggy DHCP clients in virtualized environments.
@@ -781,7 +846,7 @@ config NETFILTER_XT_TARGET_CLASSIFY
the priority of a packet. Some qdiscs can use this value for
classification, among these are:
- atm, cbq, dsmark, pfifo_fast, htb, prio
+ atm, cbq, dsmark, pfifo_fast, htb, prio
To compile it as a module, choose M here. If unsure, say N.
@@ -790,7 +855,7 @@ config NETFILTER_XT_TARGET_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_CONNMARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
@@ -810,7 +875,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK
config NETFILTER_XT_TARGET_CT
tristate '"CT" target support'
depends on NF_CONNTRACK
- depends on IP_NF_RAW || IP6_NF_RAW
+ depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This options adds a `CT' target, which allows to specify initial
@@ -821,7 +886,7 @@ config NETFILTER_XT_TARGET_CT
config NETFILTER_XT_TARGET_DSCP
tristate '"DSCP" and "TOS" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a `DSCP' target, which allows you to manipulate
@@ -837,9 +902,9 @@ config NETFILTER_XT_TARGET_DSCP
config NETFILTER_XT_TARGET_HL
tristate '"HL" hoplimit target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds the "HL" (for IPv6) and "TTL" (for IPv4)
targets, which enable the user to change the
hoplimit/time-to-live value of the IP header.
@@ -854,7 +919,7 @@ config NETFILTER_XT_TARGET_HMARK
tristate '"HMARK" target support'
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds the "HMARK" target.
The target allows you to create rules in the "raw" and "mangle" tables
@@ -898,12 +963,11 @@ config NETFILTER_XT_TARGET_LED
echo netfilter-ssh > /sys/class/leds/<ledname>/trigger
For more information on the LEDs available on your system, see
- Documentation/leds/leds-class.txt
+ Documentation/leds/leds-class.rst
config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
- select NF_LOG_COMMON
- select NF_LOG_IPV4
+ select NF_LOG_SYSLOG
select NF_LOG_IPV6 if IP6_NF_IPTABLES
default m if NETFILTER_ADVANCED=n
help
@@ -916,7 +980,7 @@ config NETFILTER_XT_TARGET_MARK
tristate '"MARK" target support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
@@ -924,7 +988,7 @@ config NETFILTER_XT_TARGET_MARK
config NETFILTER_XT_NAT
tristate '"SNAT and DNAT" targets support'
depends on NF_NAT
- ---help---
+ help
This option enables the SNAT and DNAT targets.
To compile it as a module, choose M here. If unsure, say N.
@@ -932,7 +996,7 @@ config NETFILTER_XT_NAT
config NETFILTER_XT_TARGET_NETMAP
tristate '"NETMAP" target support'
depends on NF_NAT
- ---help---
+ help
NETMAP is an implementation of static 1:1 NAT mapping of network
addresses. It maps the network address part, while keeping the host
address part intact.
@@ -982,7 +1046,7 @@ config NETFILTER_XT_TARGET_REDIRECT
tristate "REDIRECT target support"
depends on NF_NAT
select NF_NAT_REDIRECT
- ---help---
+ help
REDIRECT is a special case of NAT: all incoming connections are
mapped onto the incoming interface's address, causing the packets to
come to the local machine instead of passing through. This is
@@ -990,14 +1054,29 @@ config NETFILTER_XT_TARGET_REDIRECT
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_TARGET_MASQUERADE
+ tristate "MASQUERADE target support"
+ depends on NF_NAT
+ default m if NETFILTER_ADVANCED=n
+ select NF_NAT_MASQUERADE
+ help
+ Masquerading is a special case of NAT: all outgoing connections are
+ changed to seem to come from a particular interface's address, and
+ if the interface goes down, those connections are lost. This is
+ only useful for dialup accounts with dynamic IP address (ie. your IP
+ address will be different on next dialup).
+
+ To compile it as a module, choose M here. If unsure, say N.
+
config NETFILTER_XT_TARGET_TEE
tristate '"TEE" - packet cloning to alternate destination'
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
+ depends on IP6_NF_IPTABLES || !IP6_NF_IPTABLES
select NF_DUP_IPV4
select NF_DUP_IPV6 if IP6_NF_IPTABLES
- ---help---
+ help
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -1007,7 +1086,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
- depends on IP_NF_MANGLE
+ depends on IP_NF_MANGLE || NFT_COMPAT
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
select NF_TPROXY_IPV4
@@ -1019,7 +1098,7 @@ config NETFILTER_XT_TARGET_TPROXY
on Netfilter connection tracking and NAT, unlike REDIRECT.
For it to work you will have to configure certain iptables rules
and use policy routing. For more information on how to set it up
- see Documentation/networking/tproxy.txt.
+ see Documentation/networking/tproxy.rst.
To compile it as a module, choose M here. If unsure, say N.
@@ -1033,7 +1112,7 @@ config NETFILTER_XT_TARGET_TRACE
the tables, chains, rules.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_TARGET_SECMARK
tristate '"SECMARK" target support'
@@ -1049,7 +1128,7 @@ config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option adds a `TCPMSS' target, which allows you to alter the
MSS value of TCP SYN packets, to control the maximum size for that
connection (usually limiting it to your outgoing interface's MTU
@@ -1074,7 +1153,7 @@ config NETFILTER_XT_TARGET_TCPMSS
config NETFILTER_XT_TARGET_TCPOPTSTRIP
tristate '"TCPOPTSTRIP" target support'
- depends on IP_NF_MANGLE || IP6_NF_MANGLE
+ depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
help
This option adds a "TCPOPTSTRIP" target, which allows you to strip
@@ -1087,12 +1166,12 @@ comment "Xtables matches"
config NETFILTER_XT_MATCH_ADDRTYPE
tristate '"addrtype" address type match support'
default m if NETFILTER_ADVANCED=n
- ---help---
+ help
This option allows you to match what routing thinks of an address,
eg. UNICAST, LOCAL, BROADCAST, ...
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_BPF
tristate '"bpf" match support'
@@ -1107,8 +1186,8 @@ config NETFILTER_XT_MATCH_CGROUP
tristate '"control group" match support'
depends on NETFILTER_ADVANCED
depends on CGROUPS
- select CGROUP_NET_CLASSID
- ---help---
+ select SOCK_CGROUP_DATA
+ help
Socket/process control group matching allows you to match locally
generated packets based on which net_cls control group processes
belong to.
@@ -1117,7 +1196,7 @@ config NETFILTER_XT_MATCH_CLUSTER
tristate '"cluster" match support'
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option allows you to build work-load-sharing clusters of
network servers/stateful firewalls without having a dedicated
load-balancing router/server/switch. Basically, this match returns
@@ -1137,7 +1216,7 @@ config NETFILTER_XT_MATCH_COMMENT
comments in your iptables ruleset.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_CONNBYTES
tristate '"connbytes" per-connection counter match support'
@@ -1148,14 +1227,14 @@ config NETFILTER_XT_MATCH_CONNBYTES
number of bytes and/or packets for each direction within a connection.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_CONNLABEL
tristate '"connlabel" match support'
select NF_CONNTRACK_LABELS
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
- ---help---
+ help
This match allows you to test and assign userspace-defined labels names
to a connection. The kernel only stores bit values - mapping
names to bits is done by userspace.
@@ -1168,7 +1247,7 @@ config NETFILTER_XT_MATCH_CONNLIMIT
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_CONNCOUNT
- ---help---
+ help
This match allows you to match against the number of parallel
connections to a server per client IP address (or address block).
@@ -1177,7 +1256,7 @@ config NETFILTER_XT_MATCH_CONNMARK
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_CONNMARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
@@ -1205,16 +1284,16 @@ config NETFILTER_XT_MATCH_CPU
To compile it as a module, choose M here. If unsure, say N.
config NETFILTER_XT_MATCH_DCCP
- tristate '"dccp" protocol match support'
+ tristate '"dccp" protocol match support (DEPRECATED)'
depends on NETFILTER_ADVANCED
- default IP_DCCP
+ default n
help
With this option enabled, you will be able to use the iptables
`dccp' match in order to match on DCCP source/destination ports
and DCCP flags.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_DEVGROUP
tristate '"devgroup" match support'
@@ -1243,7 +1322,7 @@ config NETFILTER_XT_MATCH_DSCP
config NETFILTER_XT_MATCH_ECN
tristate '"ecn" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds an "ECN" match, which allows you to match against
the IPv4 and TCP header ECN fields.
@@ -1279,14 +1358,14 @@ config NETFILTER_XT_MATCH_HELPER
depends on NETFILTER_ADVANCED
help
Helper matching allows you to match packets in dynamic connections
- tracked by a conntrack-helper, ie. ip_conntrack_ftp
+ tracked by a conntrack-helper, ie. nf_conntrack_ftp
To compile it as a module, choose M here. If unsure, say Y.
config NETFILTER_XT_MATCH_HL
tristate '"hl" hoplimit/TTL match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
HL matching allows you to match packets based on the hoplimit
in the IPv6 header, or the time-to-live field in the IPv4
header of the packet.
@@ -1303,7 +1382,7 @@ config NETFILTER_XT_MATCH_IPCOMP
config NETFILTER_XT_MATCH_IPRANGE
tristate '"iprange" address range match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a "iprange" match, which allows you to match based on
an IP address range. (Normal iptables only matches on single addresses
with an optional mask.)
@@ -1324,7 +1403,7 @@ config NETFILTER_XT_MATCH_L2TP
tristate '"l2tp" match support'
depends on NETFILTER_ADVANCED
default L2TP
- ---help---
+ help
This option adds an "L2TP" match, which allows you to match against
L2TP protocol header fields.
@@ -1362,7 +1441,7 @@ config NETFILTER_XT_MATCH_MARK
tristate '"mark" match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MARK
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MARK (combined mark/MARK module).
@@ -1404,7 +1483,7 @@ config NETFILTER_XT_MATCH_OSF
config NETFILTER_XT_MATCH_OWNER
tristate '"owner" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
Socket owner matching allows you to match locally-generated packets
based on who created the socket: the user or group. It is also
possible to check whether a socket actually exists.
@@ -1450,7 +1529,7 @@ config NETFILTER_XT_MATCH_QUOTA
byte counter.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_RATEEST
tristate '"rateest" match support'
@@ -1470,16 +1549,16 @@ config NETFILTER_XT_MATCH_REALM
This option adds a `realm' match, which allows you to use the realm
key from the routing subsystem inside iptables.
- This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
+ This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option
in tc world.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_RECENT
tristate '"recent" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This match is used for creating one or many lists of recently
used addresses and then matching against that/those list(s).
@@ -1491,12 +1570,12 @@ config NETFILTER_XT_MATCH_SCTP
depends on NETFILTER_ADVANCED
default IP_SCTP
help
- With this option enabled, you will be able to use the
+ With this option enabled, you will be able to use the
`sctp' match in order to match on SCTP source/destination ports
and SCTP chunk types.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config NETFILTER_XT_MATCH_SOCKET
tristate '"socket" match support'
@@ -1562,7 +1641,7 @@ config NETFILTER_XT_MATCH_TCPMSS
config NETFILTER_XT_MATCH_TIME
tristate '"time" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a "time" match, which allows you to match based on
the packet arrival time (at the machine which netfilter is running)
on) or departure time/date (for locally generated packets).
@@ -1576,7 +1655,7 @@ config NETFILTER_XT_MATCH_TIME
config NETFILTER_XT_MATCH_U32
tristate '"u32" match support'
depends on NETFILTER_ADVANCED
- ---help---
+ help
u32 allows you to extract quantities of up to 4 bytes from a packet,
AND them with specified masks, shift them by specified amounts and
test whether the results are in any of a set of specified ranges.
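
[Annotation: the NETFILTER_EGRESS option added in the Kconfig hunks above introduces a second per-device hook point, NF_NETDEV_EGRESS, mirroring the existing NF_NETDEV_INGRESS. As a minimal sketch of how a module might attach there, assuming the standard nf_register_net_hook() API; the callback and variable names are illustrative only:

#include <linux/netfilter.h>
#include <linux/netdevice.h>

/* Illustrative callback: accept everything, just to show the signature. */
static unsigned int sample_egress_fn(void *priv, struct sk_buff *skb,
				     const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static struct nf_hook_ops sample_egress_ops = {
	.hook     = sample_egress_fn,
	.pf       = NFPROTO_NETDEV,
	.hooknum  = NF_NETDEV_EGRESS,
	.priority = 0,
	/* .dev must be set to the target net_device before registration;
	 * netdev-family hooks are per device. */
};

/* Attach with: nf_register_net_hook(dev_net(dev), &sample_egress_ops); */
]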
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 1ae65a314d7a..6bfc250e474f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -11,22 +11,28 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
+ifeq ($(CONFIG_NF_CONNTRACK),m)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_conntrack_bpf.o
+else ifeq ($(CONFIG_NF_CONNTRACK),y)
+nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
+endif
obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
+obj-$(CONFIG_NETFILTER_NETLINK_HOOK) += nfnetlink_hook.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
-obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
-
# netlink interface for nf_conntrack
obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o
obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o
@@ -49,14 +55,18 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o
-# generic transport layer logging
-obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
-
-# packet logging for netdev family
-obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
+obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o
obj-$(CONFIG_NF_NAT) += nf_nat.o
nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
+nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
+nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o
+
+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
# NAT helpers
obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
@@ -77,13 +87,25 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
- nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
-
-nf_tables_set-objs := nf_tables_set_core.o \
- nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
+ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \
+ nft_counter.o nft_objref.o nft_inner.o \
+ nft_chain_route.o nf_tables_offload.o \
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+ nft_set_pipapo.o
+
+ifdef CONFIG_X86_64
+ifndef CONFIG_UML
+nf_tables-objs += nft_set_pipapo_avx2.o
+endif
+endif
+
+ifdef CONFIG_NFT_CT
+ifdef CONFIG_MITIGATION_RETPOLINE
+nf_tables-objs += nft_ct_fast.o
+endif
+endif
obj-$(CONFIG_NF_TABLES) += nf_tables.o
-obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
@@ -91,13 +113,12 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o
obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
-obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
+obj-$(CONFIG_NFT_REJECT_NETDEV) += nft_reject_netdev.o
obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
-obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
@@ -109,6 +130,9 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
obj-$(CONFIG_NFT_OSF) += nft_osf.o
obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o
+obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o
+
+obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
@@ -116,11 +140,19 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \
+ nf_flow_table_path.o \
+ nf_flow_table_offload.o nf_flow_table_xdp.o
+nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o
+ifeq ($(CONFIG_NF_FLOW_TABLE),m)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_flow_table_bpf.o
+else ifeq ($(CONFIG_NF_FLOW_TABLE),y)
+nf_flow_table-$(CONFIG_DEBUG_INFO_BTF) += nf_flow_table_bpf.o
+endif
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
-# generic X tables
+# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
# combos
@@ -145,6 +177,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o
obj-$(CONFIG_NETFILTER_XT_TARGET_REDIRECT) += xt_REDIRECT.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_MASQUERADE) += xt_MASQUERADE.o
obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
@@ -204,3 +237,6 @@ obj-$(CONFIG_IP_SET) += ipset/
# IPVS
obj-$(CONFIG_IP_VS) += ipvs/
+
+# lwtunnel
+obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 93aaec3a54ec..11a702065bab 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
+#include <net/netfilter/nf_queue.h>
#include <net/sock.h>
#include "nf_internals.h"
@@ -30,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
-DEFINE_PER_CPU(bool, nf_skb_duplicated);
-EXPORT_SYMBOL_GPL(nf_skb_duplicated);
-
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
@@ -57,7 +55,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
if (num == 0)
return NULL;
- e = kvzalloc(alloc, GFP_KERNEL);
+ e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
if (e)
e->num_hook_entries = num;
return e;
@@ -118,6 +116,18 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
for (i = 0; i < old_entries; i++) {
if (orig_ops[i] != &dummy_ops)
alloc_entries++;
+
+ /* Restrict BPF hook type to force a unique priority, not
+ * shared at attach time.
+ *
+ * This is mainly to avoid ordering issues between two
+ * different bpf programs, this doesn't prevent a normal
+ * hook at same priority as a bpf one (we don't want to
+ * prevent defrag, conntrack, iptables etc from attaching).
+ */
+ if (reg->priority == orig_ops[i]->priority &&
+ reg->hook_ops_type == NF_HOOK_OP_BPF)
+ return ERR_PTR(-EBUSY);
}
}
@@ -162,7 +172,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
static void hooks_validate(const struct nf_hook_entries *hooks)
{
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
struct nf_hook_ops **orig_ops;
int prio = INT_MIN;
size_t i = 0;
@@ -281,6 +291,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return NULL;
return net->nf.hooks_bridge + hooknum;
#endif
+#ifdef CONFIG_NETFILTER_INGRESS
+ case NFPROTO_INET:
+ if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS))
+ return NULL;
+ if (!dev || dev_net(dev) != net) {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+ return &dev->nf_hooks_ingress;
+#endif
case NFPROTO_IPV4:
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum))
return NULL;
@@ -289,12 +309,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum))
return NULL;
return net->nf.hooks_ipv6 + hooknum;
-#if IS_ENABLED(CONFIG_DECNET)
- case NFPROTO_DECNET:
- if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_decnet) <= hooknum))
- return NULL;
- return net->nf.hooks_decnet + hooknum;
-#endif
default:
WARN_ON_ONCE(1);
return NULL;
@@ -306,24 +320,106 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
return &dev->nf_hooks_ingress;
}
#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hooknum == NF_NETDEV_EGRESS) {
+ if (dev && dev_net(dev) == net)
+ return &dev->nf_hooks_egress;
+ }
+#endif
WARN_ON_ONCE(1);
return NULL;
}
+static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg,
+ int hooknum)
+{
+#ifndef CONFIG_NETFILTER_INGRESS
+ if (reg->hooknum == hooknum)
+ return -EOPNOTSUPP;
+#endif
+ if (reg->hooknum != hooknum ||
+ !reg->dev || dev_net(reg->dev) != net)
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) ||
+ (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS))
+ return true;
+
+ return false;
+}
+
+static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg,
+ int pf)
+{
+ return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS;
+}
+
+static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_inc(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
+static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf)
+{
+#ifdef CONFIG_JUMP_LABEL
+ int hooknum;
+
+ if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) {
+ pf = NFPROTO_NETDEV;
+ hooknum = NF_NETDEV_INGRESS;
+ } else {
+ hooknum = reg->hooknum;
+ }
+ static_key_slow_dec(&nf_hooks_needed[pf][hooknum]);
+#endif
+}
+
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
+ int err;
- if (pf == NFPROTO_NETDEV) {
+ switch (pf) {
+ case NFPROTO_NETDEV:
#ifndef CONFIG_NETFILTER_INGRESS
if (reg->hooknum == NF_NETDEV_INGRESS)
return -EOPNOTSUPP;
#endif
- if (reg->hooknum != NF_NETDEV_INGRESS ||
+#ifndef CONFIG_NETFILTER_EGRESS
+ if (reg->hooknum == NF_NETDEV_EGRESS)
+ return -EOPNOTSUPP;
+#endif
+ if ((reg->hooknum != NF_NETDEV_INGRESS &&
+ reg->hooknum != NF_NETDEV_EGRESS) ||
!reg->dev || dev_net(reg->dev) != net)
return -EINVAL;
+ break;
+ case NFPROTO_INET:
+ if (reg->hooknum != NF_INET_INGRESS)
+ break;
+
+ err = nf_ingress_check(net, reg, NF_INET_INGRESS);
+ if (err < 0)
+ return err;
+ break;
}
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
@@ -335,21 +431,25 @@ static int __nf_register_net_hook(struct net *net, int pf,
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);
- if (!IS_ERR(new_hooks))
+ if (!IS_ERR(new_hooks)) {
+ hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
+ }
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);
- hooks_validate(new_hooks);
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_inc_egress_queue();
#endif
+ nf_static_key_inc(reg, pf);
+
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
@@ -375,7 +475,7 @@ static bool nf_remove_net_hook(struct nf_hook_entries *old,
if (orig_ops[i] != unreg)
continue;
WRITE_ONCE(old->hooks[i].hook, accept_all);
- WRITE_ONCE(orig_ops[i], &dummy_ops);
+ WRITE_ONCE(orig_ops[i], (void *)&dummy_ops);
return true;
}
@@ -402,12 +502,14 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
if (nf_remove_net_hook(p, reg)) {
#ifdef CONFIG_NETFILTER_INGRESS
- if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
+ if (nf_ingress_hook(reg, pf))
net_dec_ingress_queue();
#endif
-#ifdef CONFIG_JUMP_LABEL
- static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (nf_egress_hook(reg, pf))
+ net_dec_egress_queue();
#endif
+ nf_static_key_dec(reg, pf);
} else {
WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum);
}
@@ -424,8 +526,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
if (reg->pf == NFPROTO_INET) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ if (reg->hooknum == NF_INET_INGRESS) {
+ __nf_unregister_net_hook(net, NFPROTO_INET, reg);
+ } else {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ __nf_unregister_net_hook(net, NFPROTO_IPV6, reg);
+ }
} else {
__nf_unregister_net_hook(net, reg->pf, reg);
}
@@ -450,14 +556,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
int err;
if (reg->pf == NFPROTO_INET) {
- err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
- if (err < 0)
- return err;
-
- err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
- if (err < 0) {
- __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
- return err;
+ if (reg->hooknum == NF_INET_INGRESS) {
+ err = __nf_register_net_hook(net, NFPROTO_INET, reg);
+ if (err < 0)
+ return err;
+ } else {
+ err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
+ if (err < 0)
+ return err;
+
+ err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
+ if (err < 0) {
+ __nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
+ return err;
+ }
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
@@ -513,20 +625,21 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
case NF_ACCEPT:
break;
case NF_DROP:
- kfree_skb(skb);
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_NETFILTER_DROP);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
return ret;
case NF_QUEUE:
- ret = nf_queue(skb, state, e, s, verdict);
+ ret = nf_queue(skb, state, s, verdict);
if (ret == 1)
continue;
return ret;
+ case NF_STOLEN:
+ return NF_DROP_GETERR(verdict);
default:
- /* Implicit handling for NF_STOLEN, as well as any other
- * non conventional verdicts.
- */
+ WARN_ON_ONCE(1);
return 0;
}
}
@@ -535,57 +648,59 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
}
EXPORT_SYMBOL(nf_hook_slow);
-
-int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
+void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
+ const struct nf_hook_entries *e)
{
- if (writable_len > skb->len)
- return 0;
-
- /* Not exclusive use of packet? Must copy. */
- if (!skb_cloned(skb)) {
- if (writable_len <= skb_headlen(skb))
- return 1;
- } else if (skb_clone_writable(skb, writable_len))
- return 1;
-
- if (writable_len <= skb_headlen(skb))
- writable_len = 0;
- else
- writable_len -= skb_headlen(skb);
+ struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
+ int ret;
- return !!__pskb_pull_tail(skb, writable_len);
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
+ ret = nf_hook_slow(skb, state, e, 0);
+ if (ret == 1)
+ list_add_tail(&skb->list, &sublist);
+ }
+ /* Put passed packets back on main list */
+ list_splice(&sublist, head);
}
-EXPORT_SYMBOL(skb_make_writable);
+EXPORT_SYMBOL(nf_hook_slow_list);
/* This needs to be compiled in any case to avoid dependencies between the
* nfnetlink_queue code and nf_conntrack.
*/
-struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nfnl_ct_hook);
-struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_hook);
+const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v4_hook);
+
+const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_defrag_v6_hook);
+
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-/* This does not belong here, but locally generated errors need it if connection
- tracking in use: without this, connection may not be in hash table, and hence
- manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
- __rcu __read_mostly;
-EXPORT_SYMBOL(ip_ct_attach);
+u8 nf_ctnetlink_has_listener;
+EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener);
-struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_hook);
+/* This does not belong here, but locally generated errors need it if connection
+ * tracking in use: without this, connection may not be in hash table, and hence
+ * manufactured ICMP or RST packets will not be associated with it.
+ */
void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
{
- void (*attach)(struct sk_buff *, const struct sk_buff *);
+ const struct nf_ct_hook *ct_hook;
if (skb->_nfct) {
rcu_read_lock();
- attach = rcu_dereference(ip_ct_attach);
- if (attach)
- attach(new, skb);
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->attach(new, skb);
rcu_read_unlock();
}
}
@@ -593,20 +708,38 @@ EXPORT_SYMBOL(nf_ct_attach);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- BUG_ON(ct_hook == NULL);
- ct_hook->destroy(nfct);
+ if (ct_hook)
+ ct_hook->destroy(nfct);
rcu_read_unlock();
+
+ WARN_ON(!ct_hook);
}
EXPORT_SYMBOL(nf_conntrack_destroy);
+void nf_ct_set_closing(struct nf_conntrack *nfct)
+{
+ const struct nf_ct_hook *ct_hook;
+
+ if (!nfct)
+ return;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (ct_hook)
+ ct_hook->set_closing(nfct);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_ct_set_closing);
+
bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
const struct sk_buff *skb)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_ct_hook *ct_hook;
bool ret = false;
rcu_read_lock();
@@ -645,10 +778,6 @@ static int __net_init netfilter_net_init(struct net *net)
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
__netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge));
#endif
-#if IS_ENABLED(CONFIG_DECNET)
- __netfilter_net_init(net->nf.hooks_decnet, ARRAY_SIZE(net->nf.hooks_decnet));
-#endif
-
#ifdef CONFIG_PROC_FS
net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
net->proc_net);
@@ -681,12 +810,21 @@ int __init netfilter_init(void)
if (ret < 0)
goto err;
+#ifdef CONFIG_LWTUNNEL
+ ret = netfilter_lwtunnel_init();
+ if (ret < 0)
+ goto err_lwtunnel_pernet;
+#endif
ret = netfilter_log_init();
if (ret < 0)
- goto err_pernet;
+ goto err_log_pernet;
return 0;
-err_pernet:
+err_log_pernet:
+#ifdef CONFIG_LWTUNNEL
+ netfilter_lwtunnel_fini();
+err_lwtunnel_pernet:
+#endif
unregister_pernet_subsys(&netfilter_net_ops);
err:
return ret;
diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig
index 4083a8051f0f..b1ea054bb82c 100644
--- a/net/netfilter/ipset/Kconfig
+++ b/net/netfilter/ipset/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menuconfig IP_SET
tristate "IP set support"
depends on INET && NETFILTER
@@ -29,7 +30,7 @@ config IP_SET_BITMAP_IP
depends on IP_SET
help
This option adds the bitmap:ip set type support, by which one
- can store IPv4 addresses (or network addresse) from a range.
+ can store IPv4 addresses (or network addresses) from a range.
To compile it as a module, choose M here. If unsure, say N.
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 257ca393e6f2..798c7993635e 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -1,13 +1,11 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef __IP_SET_BITMAP_IP_GEN_H
#define __IP_SET_BITMAP_IP_GEN_H
+#include <linux/rcupdate_wait.h>
+
#define mtype_do_test IPSET_TOKEN(MTYPE, _do_test)
#define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test)
#define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled)
@@ -32,6 +30,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
@@ -61,12 +60,9 @@ mtype_destroy(struct ip_set *set)
{
struct mtype *map = set->data;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
-
- ip_set_free(map->members);
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
+ ip_set_free(map->members);
ip_set_free(map);
set->data = NULL;
@@ -79,7 +75,7 @@ mtype_flush(struct ip_set *set)
if (set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
- memset(map->members, 0, map->memsize);
+ bitmap_zero(map->members, map->elements);
set->elements = 0;
set->ext_size = 0;
}
@@ -99,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
struct nlattr *nested;
size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
@@ -109,7 +105,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
return 0;
nla_put_failure:
@@ -196,7 +192,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
#ifndef IP_SET_BITMAP_STORED_TIMEOUT
-static inline bool
+static bool
mtype_is_filled(const struct mtype_elem *x)
{
return true;
@@ -213,7 +209,7 @@ mtype_list(const struct ip_set *set,
u32 id, first = cb->args[IPSET_CB_ARG0];
int ret = 0;
- adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ adt = nla_nest_start(skb, IPSET_ATTR_ADT);
if (!adt)
return -EMSGSIZE;
/* Extensions may be replaced */
@@ -230,7 +226,7 @@ mtype_list(const struct ip_set *set,
#endif
ip_set_timeout_expired(ext_timeout(x, set))))
continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
if (id == first) {
nla_nest_cancel(skb, adt);
@@ -244,9 +240,9 @@ mtype_list(const struct ip_set *set,
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x)))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
}
- ipset_nest_end(skb, adt);
+ nla_nest_end(skb, adt);
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
@@ -259,7 +255,7 @@ nla_put_failure:
cb->args[IPSET_CB_ARG0] = 0;
ret = -EMSGSIZE;
}
- ipset_nest_end(skb, adt);
+ nla_nest_end(skb, adt);
out:
rcu_read_unlock();
return ret;
@@ -268,7 +264,7 @@ out:
static void
mtype_gc(struct timer_list *t)
{
- struct mtype *map = from_timer(map, t, gc);
+ struct mtype *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
void *x;
u32 id;
@@ -292,6 +288,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_delete_sync(&map->gc);
+}
+
static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt,
.uadt = mtype_uadt,
@@ -305,6 +310,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head,
.list = mtype_list,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
};
#endif /* __IP_SET_BITMAP_IP_GEN_H */
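
To illustrate the new ->cancel_gc split: a sketch of a core-side destroy path like the ones added to ip_set_core.c in this series (demo_destroy() itself is hypothetical).

static void demo_destroy(struct ip_set *set)
{
        set->variant->cancel_gc(set);   /* no gc timer can fire past this */
        set->variant->destroy(set);     /* now safe to free members/extensions */
}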
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 488d6d05c65c..5988b9bb9029 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module implementing an IP set type: the bitmap:ip type */
@@ -31,7 +28,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
@@ -40,7 +37,7 @@ MODULE_ALIAS("ip_set_bitmap:ip");
/* Type structure */
struct bitmap_ip {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -49,7 +46,7 @@ struct bitmap_ip {
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
@@ -58,7 +55,7 @@ struct bitmap_ip_adt_elem {
u16 id;
};
-static inline u32
+static u32
ip_to_id(const struct bitmap_ip *m, u32 ip)
{
return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts;
@@ -66,33 +63,33 @@ ip_to_id(const struct bitmap_ip *m, u32 ip)
/* Common functions */
-static inline int
+static int
bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e,
struct bitmap_ip *map, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map, size_t dsize)
{
return !!test_bit(id, map->members);
}
-static inline int
+static int
bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
u32 flags, size_t dsize)
{
 return !!test_and_set_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
size_t dsize)
{
@@ -100,7 +97,7 @@ bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id,
htonl(map->first_ip + id * map->hosts));
}
-static inline int
+static int
bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
{
return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
@@ -166,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to) {
+ if (ip > ip_to)
swap(ip, ip_to);
- if (ip < map->first_ip)
- return -IPSET_ERR_BITMAP_RANGE;
- }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -181,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
ip_to = ip;
}
- if (ip_to > map->last_ip)
+ if (ip < map->first_ip || ip_to > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
for (; !before(ip_to, ip); ip += map->hosts) {
@@ -223,7 +217,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
u32 first_ip, u32 last_ip,
u32 elements, u32 hosts, u8 netmask)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_ip = first_ip;
@@ -240,6 +234,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
return true;
}
+static u32
+range_to_mask(u32 from, u32 to, u8 *bits)
+{
+ u32 mask = 0xFFFFFFFE;
+
+ *bits = 32;
+ while (--(*bits) > 0 && mask && (to & mask) != from)
+ mask <<= 1;
+
+ return mask;
+}
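+
A worked example of the helper above, assuming host-byte-order IPv4 values as bitmap_ip_create() passes them (demo_range_to_mask() is illustrative only):

static void demo_range_to_mask(void)
{
        u8 bits;
        /* 192.168.0.0 - 192.168.0.255: the loop stops once (to & mask) == from */
        u32 mask = range_to_mask(0xC0A80000, 0xC0A800FF, &bits);

        WARN_ON(mask != 0xFFFFFF00 || bits != 24);      /* a /24 */
}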
+
static int
bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
u32 flags)
@@ -299,8 +305,8 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask);
- hosts = 2 << (32 - netmask - 1);
- elements = 2 << (netmask - mask_bits - 1);
+ hosts = 2U << (32 - netmask - 1);
+ elements = 2UL << (netmask - mask_bits - 1);
}
if (elements > IPSET_BITMAP_MAX_RANGE + 1)
return -IPSET_ERR_BITMAP_RANGE_SIZE;
@@ -313,11 +319,11 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- map->memsize = bitmap_bytes(0, elements - 1);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ip;
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 980000fc3b50..2c625e0f49ec 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* Kernel module implementing an IP set type: the bitmap:ip,mac type */
@@ -31,7 +27,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
@@ -46,14 +42,14 @@ enum {
/* Type structure */
struct bitmap_ipmac {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* MAC + data extensions */
+ unsigned char extensions[] /* MAC + data extensions */
__aligned(__alignof__(u64));
};
@@ -69,7 +65,7 @@ struct bitmap_ipmac_elem {
unsigned char filled;
} __aligned(__alignof__(u64));
-static inline u32
+static u32
ip_to_id(const struct bitmap_ipmac *m, u32 ip)
{
return ip - m->first_ip;
@@ -83,7 +79,7 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip)
/* Common functions */
-static inline int
+static int
bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
const struct bitmap_ipmac *map, size_t dsize)
{
@@ -98,7 +94,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
return -EAGAIN;
}
-static inline int
+static int
bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
{
const struct bitmap_ipmac_elem *elem;
@@ -110,13 +106,13 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
return elem->filled == MAC_FILLED;
}
-static inline int
+static int
bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
{
return elem->filled == MAC_FILLED;
}
-static inline int
+static int
bitmap_ipmac_add_timeout(unsigned long *timeout,
const struct bitmap_ipmac_adt_elem *e,
const struct ip_set_ext *ext, struct ip_set *set,
@@ -143,7 +139,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
return 0;
}
-static inline int
+static int
bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac *map, u32 flags, size_t dsize)
{
@@ -181,14 +177,14 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
return IPSET_ADD_STORE_PLAIN_TIMEOUT;
}
-static inline int
+static int
bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
struct bitmap_ipmac *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
u32 id, size_t dsize)
{
@@ -201,7 +197,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether));
}
-static inline int
+static int
bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
{
return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
@@ -230,7 +226,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
e.id = ip_to_id(map, ip);
- if (opt->flags & IPSET_DIM_ONE_SRC)
+ if (opt->flags & IPSET_DIM_TWO_SRC)
ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
else
ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
@@ -303,7 +299,7 @@ static bool
init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
u32 first_ip, u32 last_ip, u32 elements)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_ip = first_ip;
@@ -364,10 +360,10 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- map->memsize = bitmap_bytes(0, elements - 1);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_ipmac;
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b561ca8b3659..7138e080def4 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the bitmap:port type */
@@ -26,7 +22,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_bitmap:port");
@@ -34,14 +30,14 @@ MODULE_ALIAS("ip_set_bitmap:port");
/* Type structure */
struct bitmap_port {
- void *members; /* the set members */
+ unsigned long *members; /* the set members */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
@@ -50,7 +46,7 @@ struct bitmap_port_adt_elem {
u16 id;
};
-static inline u16
+static u16
port_to_id(const struct bitmap_port *m, u16 port)
{
return port - m->first_port;
@@ -58,34 +54,34 @@ port_to_id(const struct bitmap_port *m, u16 port)
/* Common functions */
-static inline int
+static int
bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
const struct bitmap_port *map, size_t dsize)
{
return !!test_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_gc_test(u16 id, const struct bitmap_port *map, size_t dsize)
{
return !!test_bit(id, map->members);
}
-static inline int
+static int
bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map, u32 flags, size_t dsize)
{
 return !!test_and_set_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
struct bitmap_port *map)
{
return !test_and_clear_bit(e->id, map->members);
}
-static inline int
+static int
bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
size_t dsize)
{
@@ -93,13 +89,40 @@ bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id,
htons(map->first_port + id));
}
-static inline int
+static int
bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
{
return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
}
+static bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+ bool ret;
+ u8 proto;
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ ret = ip_set_get_ip4_port(skb, src, port, &proto);
+ break;
+ case NFPROTO_IPV6:
+ ret = ip_set_get_ip6_port(skb, src, port, &proto);
+ break;
+ default:
+ return false;
+ }
+ if (!ret)
+ return ret;
+ switch (proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ return true;
+ default:
+ return false;
+ }
+}
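
A sketch of a kadt-style caller, under the assumption that family and direction come from the match parameters as in bitmap_port_kadt(); demo_port_lookup() is a hypothetical name.

static int demo_port_lookup(const struct sk_buff *skb, u8 family, bool src)
{
        __be16 port;

        /* false for non-IPv4/IPv6 packets and for non-TCP/UDP protocols */
        if (!ip_set_get_ip_port(skb, family, src, &port))
                return -EINVAL;
        return ntohs(port);
}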
+
static int
bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
@@ -208,7 +231,7 @@ static bool
init_map_port(struct ip_set *set, struct bitmap_port *map,
u16 first_port, u16 last_port)
{
- map->members = ip_set_alloc(map->memsize);
+ map->members = bitmap_zalloc(map->elements, GFP_KERNEL | __GFP_NOWARN);
if (!map->members)
return false;
map->first_port = first_port;
@@ -248,10 +271,10 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
return -ENOMEM;
map->elements = elements;
- map->memsize = bitmap_bytes(0, map->elements);
+ map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long);
set->variant = &bitmap_port;
if (!init_map_port(set, map, first_port, last_port)) {
- kfree(map);
+ ip_set_free(map);
return -ENOMEM;
}
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 45a257695bef..cc20e6d56807 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module for IP set management */
@@ -38,7 +35,7 @@ struct ip_set_net {
static unsigned int ip_set_net_id __read_mostly;
-static inline struct ip_set_net *ip_set_pernet(struct net *net)
+static struct ip_set_net *ip_set_pernet(struct net *net)
{
return net_generic(net, ip_set_net_id);
}
@@ -51,32 +48,35 @@ static unsigned int max_sets;
module_param(max_sets, int, 0600);
MODULE_PARM_DESC(max_sets, "maximal number of sets");
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("core IP set support");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
/* When the nfnl mutex or ip_set_ref_lock is held: */
-#define ip_set_dereference(p) \
- rcu_dereference_protected(p, \
+#define ip_set_dereference(inst) \
+ rcu_dereference_protected((inst)->ip_set_list, \
lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
- lockdep_is_held(&ip_set_ref_lock))
+ lockdep_is_held(&ip_set_ref_lock) || \
+ (inst)->is_deleted)
#define ip_set(inst, id) \
- ip_set_dereference((inst)->ip_set_list)[id]
+ ip_set_dereference(inst)[id]
#define ip_set_ref_netlink(inst,id) \
rcu_dereference_raw((inst)->ip_set_list)[id]
+#define ip_set_dereference_nfnl(p) \
+ rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* The set types are implemented in modules and registered set types
* can be found in ip_set_type_list. Adding/deleting types is
* serialized by ip_set_type_mutex.
*/
-static inline void
+static void
ip_set_type_lock(void)
{
mutex_lock(&ip_set_type_mutex);
}
-static inline void
+static void
ip_set_type_unlock(void)
{
mutex_unlock(&ip_set_type_mutex);
@@ -89,7 +89,8 @@ find_set_type(const char *name, u8 family, u8 revision)
{
struct ip_set_type *type;
- list_for_each_entry_rcu(type, &ip_set_type_list, list)
+ list_for_each_entry_rcu(type, &ip_set_type_list, list,
+ lockdep_is_held(&ip_set_type_mutex))
if (STRNCMP(type->name, name) &&
(type->family == family ||
type->family == NFPROTO_UNSPEC) &&
@@ -103,14 +104,19 @@ find_set_type(const char *name, u8 family, u8 revision)
static bool
load_settype(const char *name)
{
+ if (!try_module_get(THIS_MODULE))
+ return false;
+
nfnl_unlock(NFNL_SUBSYS_IPSET);
pr_debug("try to load ip_set_%s\n", name);
if (request_module("ip_set_%s", name) < 0) {
pr_warn("Can't find ip_set type %s\n", name);
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return false;
}
nfnl_lock(NFNL_SUBSYS_IPSET);
+ module_put(THIS_MODULE);
return true;
}
@@ -252,22 +258,7 @@ EXPORT_SYMBOL_GPL(ip_set_type_unregister);
void *
ip_set_alloc(size_t size)
{
- void *members = NULL;
-
- if (size < KMALLOC_MAX_SIZE)
- members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
-
- if (members) {
- pr_debug("%p: allocated with kmalloc\n", members);
- return members;
- }
-
- members = vzalloc(size);
- if (!members)
- return NULL;
- pr_debug("%p: allocated with vmalloc\n", members);
-
- return members;
+ return kvzalloc(size, GFP_KERNEL_ACCOUNT);
}
EXPORT_SYMBOL_GPL(ip_set_alloc);
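
For reference, a sketch of the resulting alloc/free pairing: kvzalloc() subsumes the removed kmalloc-then-vmalloc fallback, and GFP_KERNEL_ACCOUNT charges the set memory to the creating task's memory cgroup (demo_members() is hypothetical).

static unsigned long *demo_members(size_t nbits)
{
        unsigned long *m;

        m = ip_set_alloc(BITS_TO_LONGS(nbits) * sizeof(unsigned long));
        if (!m)
                return NULL;
        /* ... use m ...; release with ip_set_free(), which maps to kvfree() */
        return m;
}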
@@ -280,7 +271,7 @@ ip_set_free(void *members)
}
EXPORT_SYMBOL_GPL(ip_set_free);
-static inline bool
+static bool
flag_nested(const struct nlattr *nla)
{
return nla->nla_type & NLA_F_NESTED;
@@ -288,8 +279,7 @@ flag_nested(const struct nlattr *nla)
static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
[IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 },
- [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr) },
+ [IPSET_ATTR_IPADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
};
int
@@ -330,6 +320,83 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+static u32
+ip_set_timeout_get(const unsigned long *timeout)
+{
+ u32 t;
+
+ if (*timeout == IPSET_ELEM_PERMANENT)
+ return 0;
+
+ t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC;
+ /* Zero value in userspace means no timeout */
+ return t == 0 ? 1 : t;
+}
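
A short sketch of the round-trip the helper guarantees, assuming absolute-jiffies storage as elsewhere in this file (demo_timeout_roundtrip() is illustrative):

static void demo_timeout_roundtrip(unsigned long *slot)
{
        *slot = jiffies + msecs_to_jiffies(500);
        WARN_ON(ip_set_timeout_get(slot) == 0); /* <1s left reads as 1, not 0 */

        *slot = IPSET_ELEM_PERMANENT;
        WARN_ON(ip_set_timeout_get(slot) != 0); /* 0 means no timeout */
}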
+
+static char *
+ip_set_comment_uget(struct nlattr *tb)
+{
+ return nla_data(tb);
+}
+
+/* Called from uadd only, protected by the set spinlock.
+ * The kadt functions don't use the comment extensions in any way.
+ */
+void
+ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
+ const struct ip_set_ext *ext)
+{
+ struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
+ size_t len = ext->comment ? strlen(ext->comment) : 0;
+
+ if (unlikely(c)) {
+ set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+ kfree_rcu(c, rcu);
+ rcu_assign_pointer(comment->c, NULL);
+ }
+ if (!len)
+ return;
+ if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
+ len = IPSET_MAX_COMMENT_SIZE;
+ c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+ if (unlikely(!c))
+ return;
+ strscpy(c->str, ext->comment, len + 1);
+ set->ext_size += sizeof(*c) + strlen(c->str) + 1;
+ rcu_assign_pointer(comment->c, c);
+}
+EXPORT_SYMBOL_GPL(ip_set_init_comment);
+
+/* Used only when dumping a set, protected by rcu_read_lock() */
+static int
+ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
+{
+ struct ip_set_comment_rcu *c = rcu_dereference(comment->c);
+
+ if (!c)
+ return 0;
+ return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str);
+}
+
+/* Called from uadd/udel, flush or the garbage collectors, all protected
+ * by the set spinlock.
+ * Also called when the set is destroyed and there can't be any user
+ * of the set data anymore.
+ */
+static void
+ip_set_comment_free(struct ip_set *set, void *ptr)
+{
+ struct ip_set_comment *comment = ptr;
+ struct ip_set_comment_rcu *c;
+
+ c = rcu_dereference_protected(comment->c, 1);
+ if (unlikely(!c))
+ return;
+ set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+ kfree_rcu(c, rcu);
+ rcu_assign_pointer(comment->c, NULL);
+}
+
typedef void (*destroyer)(struct ip_set *, void *);
/* ipset data extension types, in size order */
@@ -356,12 +423,12 @@ const struct ip_set_ext_type ip_set_extensions[] = {
.flag = IPSET_FLAG_WITH_COMMENT,
.len = sizeof(struct ip_set_comment),
.align = __alignof__(struct ip_set_comment),
- .destroy = (destroyer) ip_set_comment_free,
+ .destroy = ip_set_comment_free,
},
};
EXPORT_SYMBOL_GPL(ip_set_extensions);
-static inline bool
+static bool
add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
{
return ip_set_extensions[id].flag ?
@@ -385,6 +452,8 @@ ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
+ if (align < ip_set_extensions[id].align)
+ align = ip_set_extensions[id].align;
len = ALIGN(len, ip_set_extensions[id].align);
set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
@@ -451,6 +520,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
}
EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+static u64
+ip_set_get_bytes(const struct ip_set_counter *counter)
+{
+ return (u64)atomic64_read(&(counter)->bytes);
+}
+
+static u64
+ip_set_get_packets(const struct ip_set_counter *counter)
+{
+ return (u64)atomic64_read(&(counter)->packets);
+}
+
+static bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+ return nla_put_net64(skb, IPSET_ATTR_BYTES,
+ cpu_to_be64(ip_set_get_bytes(counter)),
+ IPSET_ATTR_PAD) ||
+ nla_put_net64(skb, IPSET_ATTR_PACKETS,
+ cpu_to_be64(ip_set_get_packets(counter)),
+ IPSET_ATTR_PAD);
+}
+
+static bool
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
+{
+ /* Send nonzero parameters only */
+ return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
+ nla_put_net64(skb, IPSET_ATTR_SKBMARK,
+ cpu_to_be64((u64)skbinfo->skbmark << 32 |
+ skbinfo->skbmarkmask),
+ IPSET_ATTR_PAD)) ||
+ (skbinfo->skbprio &&
+ nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+ cpu_to_be32(skbinfo->skbprio))) ||
+ (skbinfo->skbqueue &&
+ nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+ cpu_to_be16(skbinfo->skbqueue)));
+}
+
int
ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
const void *e, bool active)
@@ -476,6 +585,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
}
EXPORT_SYMBOL_GPL(ip_set_put_extensions);
+static bool
+ip_set_match_counter(u64 counter, u64 match, u8 op)
+{
+ switch (op) {
+ case IPSET_COUNTER_NONE:
+ return true;
+ case IPSET_COUNTER_EQ:
+ return counter == match;
+ case IPSET_COUNTER_NE:
+ return counter != match;
+ case IPSET_COUNTER_LT:
+ return counter < match;
+ case IPSET_COUNTER_GT:
+ return counter > match;
+ }
+ return false;
+}
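
The operator semantics in one line, as a sketch (values illustrative, demo_counter_gt() hypothetical):

/* "has this element carried more than 1000 bytes?" */
static bool demo_counter_gt(void)
{
        return ip_set_match_counter(4096, 1000, IPSET_COUNTER_GT);      /* true */
}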
+
+static void
+ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
+{
+ atomic64_add((long long)bytes, &(counter)->bytes);
+}
+
+static void
+ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
+{
+ atomic64_add((long long)packets, &(counter)->packets);
+}
+
+static void
+ip_set_update_counter(struct ip_set_counter *counter,
+ const struct ip_set_ext *ext, u32 flags)
+{
+ if (ext->packets != ULLONG_MAX &&
+ !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
+ ip_set_add_bytes(ext->bytes, counter);
+ ip_set_add_packets(ext->packets, counter);
+ }
+}
+
+static void
+ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ mext->skbinfo = *skbinfo;
+}
+
bool
ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags, void *data)
@@ -486,13 +644,14 @@ ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
if (SET_WITH_COUNTER(set)) {
struct ip_set_counter *counter = ext_counter(data, set);
+ ip_set_update_counter(counter, ext, flags);
+
if (flags & IPSET_FLAG_MATCH_COUNTERS &&
!(ip_set_match_counter(ip_set_get_packets(counter),
mext->packets, mext->packets_op) &&
ip_set_match_counter(ip_set_get_bytes(counter),
mext->bytes, mext->bytes_op)))
return false;
- ip_set_update_counter(counter, ext, flags);
}
if (SET_WITH_SKBINFO(set))
ip_set_get_skbinfo(ext_skbinfo(data, set),
@@ -511,7 +670,7 @@ EXPORT_SYMBOL_GPL(ip_set_match_extensions);
* The set behind an index may change by swapping only, from userspace.
*/
-static inline void
+static void
__ip_set_get(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -519,7 +678,7 @@ __ip_set_get(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock);
}
-static inline void
+static void
__ip_set_put(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -531,7 +690,15 @@ __ip_set_put(struct ip_set *set)
/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
* a separate reference counter
*/
-static inline void
+static void
+__ip_set_get_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref_netlink++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static void
__ip_set_put_netlink(struct ip_set *set)
{
write_lock_bh(&ip_set_ref_lock);
@@ -546,18 +713,27 @@ __ip_set_put_netlink(struct ip_set *set)
* so it can't be destroyed (or changed) under our foot.
*/
-static inline struct ip_set *
+static struct ip_set *
ip_set_rcu_get(struct net *net, ip_set_id_t index)
{
- struct ip_set *set;
struct ip_set_net *inst = ip_set_pernet(net);
- rcu_read_lock();
- /* ip_set_list itself needs to be protected */
- set = rcu_dereference(inst->ip_set_list)[index];
- rcu_read_unlock();
+ /* ip_set_list and the set pointer need to be protected */
+ return ip_set_dereference_nfnl(inst->ip_set_list)[index];
+}
+
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_lock_bh(&set->lock);
+}
- return set;
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_unlock_bh(&set->lock);
}
int
@@ -574,16 +750,14 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return 0;
- rcu_read_lock_bh();
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
- rcu_read_unlock_bh();
if (ret == -EAGAIN) {
/* Type requests element to be completed */
pr_debug("element must be completed, ADD is triggered\n");
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
ret = 1;
} else {
/* --return-nomatch: invert matched element */
@@ -612,9 +786,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -634,9 +808,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -675,7 +849,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);
*
*/
-static inline void
+static void
__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
{
struct ip_set *set;
@@ -709,7 +883,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
BUG_ON(!set);
read_lock_bh(&ip_set_ref_lock);
- strncpy(name, set->name, IPSET_MAXNAMELEN);
+ strscpy_pad(name, set->name, IPSET_MAXNAMELEN);
read_unlock_bh(&ip_set_ref_lock);
}
EXPORT_SYMBOL_GPL(ip_set_name_byindex);
@@ -798,20 +972,9 @@ static struct nlmsghdr *
start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
enum ipset_cmd cmd)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
-
- nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd),
- sizeof(*nfmsg), flags);
- if (!nlh)
- return NULL;
-
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = NFPROTO_IPV4;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
- return nlh;
+ return nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags,
+ NFPROTO_IPV4, NFNETLINK_V0, 0);
}
/* Create a set */
@@ -877,26 +1040,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
return 0;
}
-static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
return -EOPNOTSUPP;
}
-static int ip_set_create(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
const char *name, *typename;
u8 family, revision;
- u32 flags = flag_exist(nlh);
+ u32 flags = flag_exist(info->nlh);
int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
@@ -922,7 +1081,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
if (!set)
return -ENOMEM;
spin_lock_init(&set->lock);
- strlcpy(set->name, name, IPSET_MAXNAMELEN);
+ strscpy(set->name, name, IPSET_MAXNAMELEN);
set->family = family;
set->revision = revision;
@@ -944,8 +1103,10 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
ret = -IPSET_ERR_PROTOCOL;
goto put_out;
}
+ /* Set create flags depending on the type revision */
+ set->flags |= set->type->create_flags[revision];
- ret = set->type->create(net, set, tb, flags);
+ ret = set->type->create(info->net, set, tb, flags);
if (ret != 0)
goto put_out;
@@ -978,7 +1139,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
if (!list)
goto cleanup;
/* nfnl mutex is held, both lists are valid */
- tmp = ip_set_dereference(inst->ip_set_list);
+ tmp = ip_set_dereference(inst);
memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
rcu_assign_pointer(inst->ip_set_list, list);
/* Make sure all current packets have passed through */
@@ -999,6 +1160,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl,
return ret;
cleanup:
+ set->variant->cancel_gc(set);
set->variant->destroy(set);
put_out:
module_put(set->type->me);
@@ -1016,23 +1178,56 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
+/* In order to return quickly when destroying a single set, it is split
+ * into two stages:
+ * - Cancel garbage collector
+ * - Destroy the set itself via call_rcu()
+ */
+
static void
-ip_set_destroy_set(struct ip_set *set)
+ip_set_destroy_set_rcu(struct rcu_head *head)
{
- pr_debug("set: %s\n", set->name);
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
- /* Must call it without holding any lock */
set->variant->destroy(set);
module_put(set->type->me);
kfree(set);
}
-static int ip_set_destroy(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static void
+_destroy_all_sets(struct ip_set_net *inst)
+{
+ struct ip_set *set;
+ ip_set_id_t i;
+ bool need_wait = false;
+
+ /* First cancel gc's: set:list sets are flushed as well */
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ set->variant->cancel_gc(set);
+ if (set->type->features & IPSET_TYPE_NAME)
+ need_wait = true;
+ }
+ }
+ /* Must wait for flush to be really finished */
+ if (need_wait)
+ rcu_barrier();
+ for (i = 0; i < inst->ip_set_max; i++) {
+ set = ip_set(inst, i);
+ if (set) {
+ ip_set(inst, i) = NULL;
+ set->variant->destroy(set);
+ module_put(set->type->me);
+ kfree(set);
+ }
+ }
+}
+
+static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
int ret = 0;
@@ -1040,21 +1235,18 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
-
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
* External systems (i.e. xt_set) must call
- * ip_set_put|get_nfnl_* functions, that way we
+ * ip_set_nfnl_get_* functions, that way we
* can safely check references here.
*
* list:set timer can only decrement the reference
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1064,29 +1256,34 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl,
}
inst->is_destroyed = true;
read_unlock_bh(&ip_set_ref_lock);
- for (i = 0; i < inst->ip_set_max; i++) {
- s = ip_set(inst, i);
- if (s) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(s);
- }
- }
+ _destroy_all_sets(inst);
/* Modified by ip_set_destroy() only, which is serialized */
inst->is_destroyed = false;
} else {
+ u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
- ret = -ENOENT;
+ if (!(flags & IPSET_FLAG_EXIST))
+ ret = -ENOENT;
goto out;
} else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1101,17 +1298,15 @@ ip_set_flush_set(struct ip_set *set)
{
pr_debug("set: %s\n", set->name);
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->flush(set);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
}
-static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *s;
ip_set_id_t i;
@@ -1146,12 +1341,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
-static int ip_set_rename(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set, *s;
const char *name2;
ip_set_id_t i;
@@ -1167,7 +1360,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
return -ENOENT;
write_lock_bh(&ip_set_ref_lock);
- if (set->ref != 0) {
+ if (set->ref != 0 || set->ref_netlink != 0) {
ret = -IPSET_ERR_REFERENCED;
goto out;
}
@@ -1180,7 +1373,7 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
goto out;
}
}
- strncpy(set->name, name2, IPSET_MAXNAMELEN);
+ strscpy_pad(set->name, name2, IPSET_MAXNAMELEN);
out:
write_unlock_bh(&ip_set_ref_lock);
@@ -1196,12 +1389,10 @@ out:
* so the ip_set_list always contains valid pointers to the sets.
*/
-static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *from, *to;
ip_set_id_t from_id, to_id;
char from_name[IPSET_MAXNAMELEN];
@@ -1236,9 +1427,9 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return -EBUSY;
}
- strncpy(from_name, from->name, IPSET_MAXNAMELEN);
- strncpy(from->name, to->name, IPSET_MAXNAMELEN);
- strncpy(to->name, from_name, IPSET_MAXNAMELEN);
+ strscpy_pad(from_name, from->name, IPSET_MAXNAMELEN);
+ strscpy_pad(from->name, to->name, IPSET_MAXNAMELEN);
+ strscpy_pad(to->name, from_name, IPSET_MAXNAMELEN);
swap(from->ref, to->ref);
ip_set(inst, from_id) = to;
@@ -1258,6 +1449,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF)
#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16)
+int
+ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
+{
+ u32 cadt_flags = 0;
+
+ if (SET_WITH_TIMEOUT(set))
+ if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(set->timeout))))
+ return -EMSGSIZE;
+ if (SET_WITH_COUNTER(set))
+ cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
+ if (SET_WITH_COMMENT(set))
+ cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+ if (SET_WITH_SKBINFO(set))
+ cadt_flags |= IPSET_FLAG_WITH_SKBINFO;
+ if (SET_WITH_FORCEADD(set))
+ cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
+
+ if (!cadt_flags)
+ return 0;
+ return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
+}
+EXPORT_SYMBOL_GPL(ip_set_put_flags);
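
A sketch of a set type's ->head tail mirroring mtype_head() earlier in this diff; demo_head_tail() is a hypothetical name.

static int demo_head_tail(struct ip_set *set, struct sk_buff *skb)
{
        /* emits IPSET_ATTR_TIMEOUT plus the CADT flag word, if any */
        if (unlikely(ip_set_put_flags(skb, set)))
                return -EMSGSIZE;
        return 0;
}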
+
static int
ip_set_dump_done(struct netlink_callback *cb)
{
@@ -1287,29 +1502,43 @@ dump_attrs(struct nlmsghdr *nlh)
}
}
+static const struct nla_policy
+ip_set_dump_policy[IPSET_ATTR_CMD_MAX + 1] = {
+ [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
+ [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING,
+ .len = IPSET_MAXNAMELEN - 1 },
+ [IPSET_ATTR_FLAGS] = { .type = NLA_U32 },
+};
+
static int
-dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
+ip_set_dump_start(struct netlink_callback *cb)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
struct nlattr *attr = (void *)nlh + min_len;
+ struct sk_buff *skb = cb->skb;
+ struct ip_set_net *inst = ip_set_pernet(sock_net(skb->sk));
u32 dump_type;
- ip_set_id_t index;
+ int ret;
- /* Second pass, so parser can't fail */
- nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len,
- ip_set_setname_policy, NULL);
+ ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, attr,
+ nlh->nlmsg_len - min_len,
+ ip_set_dump_policy, NULL);
+ if (ret)
+ goto error;
cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]);
if (cda[IPSET_ATTR_SETNAME]) {
+ ip_set_id_t index;
struct ip_set *set;
set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
&index);
- if (!set)
- return -ENOENT;
-
+ if (!set) {
+ ret = -ENOENT;
+ goto error;
+ }
dump_type = DUMP_ONE;
cb->args[IPSET_CB_INDEX] = index;
} else {
@@ -1325,10 +1554,17 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
cb->args[IPSET_CB_DUMP] = dump_type;
return 0;
+
+error:
+ /* We have to create and send the error message manually :-( */
+ if (nlh->nlmsg_flags & NLM_F_ACK)
+ netlink_ack(cb->skb, nlh, ret, NULL);
+ return ret;
}
static int
-ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
+ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
{
ip_set_id_t index = IPSET_INVALID_ID, max;
struct ip_set *set = NULL;
@@ -1339,18 +1575,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
bool is_destroyed;
int ret = 0;
- if (!cb->args[IPSET_CB_DUMP]) {
- ret = dump_init(cb, inst);
- if (ret < 0) {
- nlh = nlmsg_hdr(cb->skb);
- /* We have to create and send the error message
- * manually :-(
- */
- if (nlh->nlmsg_flags & NLM_F_ACK)
- netlink_ack(cb->skb, nlh, ret, NULL);
- return ret;
- }
- }
+ if (!cb->args[IPSET_CB_DUMP])
+ return -EINVAL;
if (cb->args[IPSET_CB_INDEX] >= inst->ip_set_max)
goto out;
@@ -1429,7 +1655,7 @@ dump_last:
goto next_set;
if (set->variant->uref)
set->variant->uref(set, cb, true);
- /* fall through */
+ fallthrough;
default:
ret = set->variant->list(set, skb, cb);
if (!cb->args[IPSET_CB_ARG0])
@@ -1476,20 +1702,19 @@ out:
return ret < 0 ? ret : skb->len;
}
-static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
{
struct netlink_dump_control c = {
- .dump = ip_set_dump_start,
+ .start = ip_set_dump_start,
+ .dump = ip_set_dump_do,
.done = ip_set_dump_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
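
The .start/.dump split above follows netlink's contract: .start runs exactly once from netlink_dump_start() before the first .dump invocation, so ip_set_dump_do() can treat an uninitialised cb->args[] as a hard error. A sketch with the roles spelled out (the demo_ prefix is hypothetical):

static int demo_dump(struct sock *nlsk, struct sk_buff *skb,
                     const struct nlmsghdr *nlh)
{
        struct netlink_dump_control c = {
                .start = ip_set_dump_start,     /* parse/validate once */
                .dump  = ip_set_dump_do,        /* fill one skb per call */
                .done  = ip_set_dump_done,      /* drop set references */
        };

        return netlink_dump_start(nlsk, skb, nlh, &c);
}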
@@ -1505,8 +1730,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = {
};
static int
-call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
- struct nlattr *tb[], enum ipset_adt adt,
+call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt,
u32 flags, bool use_lineno)
{
int ret;
@@ -1514,13 +1739,22 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
- spin_lock_bh(&set->lock);
+ if (retried) {
+ __ip_set_get_netlink(set);
+ nfnl_unlock(NFNL_SUBSYS_IPSET);
+ cond_resched();
+ nfnl_lock(NFNL_SUBSYS_IPSET);
+ __ip_set_put_netlink(set);
+ }
+
+ ip_set_lock(set);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
retried = true;
- } while (ret == -EAGAIN &&
- set->variant->resize &&
- (ret = set->variant->resize(set, retried)) == 0);
+ } while (ret == -ERANGE ||
+ (ret == -EAGAIN &&
+ set->variant->resize &&
+ (ret = set->variant->resize(set, retried)) == 0));
if (!ret || (ret == -IPSET_ERR_EXIST && eexist))
return 0;
@@ -1539,22 +1773,28 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
skb2 = nlmsg_new(payload, GFP_KERNEL);
if (!skb2)
return -ENOMEM;
- rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
+ rep = nlmsg_put(skb2, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = ret;
- memcpy(&errmsg->msg, nlh, nlh->nlmsg_len);
+ unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len,
+ /* Bounds checked by the skb layer. */);
+
cmdattr = (void *)&errmsg->msg + min_len;
- nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
- nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL);
+ ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr,
+ nlh->nlmsg_len - min_len, ip_set_adt_policy,
+ NULL);
+ if (ret) {
+ nlmsg_free(skb2);
+ return ret;
+ }
errline = nla_data(cda[IPSET_ATTR_LINENO]);
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
/* Signal netlink not to send its ACK/errmsg. */
return -EINTR;
}
@@ -1562,10 +1802,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
return ret;
}
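
The retry path in call_ad() above applies a pin-unlock-resched-relock pattern so repeated resizes cannot monopolise the nfnl mutex; isolated as a sketch (demo_resched_under_nfnl() is hypothetical, the helpers are the ones defined earlier in this file):

static void demo_resched_under_nfnl(struct ip_set *set)
{
        __ip_set_get_netlink(set);              /* set cannot be destroyed */
        nfnl_unlock(NFNL_SUBSYS_IPSET);
        cond_resched();                         /* let other commands run */
        nfnl_lock(NFNL_SUBSYS_IPSET);
        __ip_set_put_netlink(set);
}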
-static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_ad(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ enum ipset_adt adt,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[],
+ struct netlink_ext_ack *extack)
{
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
@@ -1596,19 +1838,18 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
attr[IPSET_ATTR_DATA],
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+ ret = call_ad(net, ctnl, skb, set, tb, adt, flags,
use_lineno);
} else {
int nla_rem;
nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(tb));
if (nla_type(nla) != IPSET_ATTR_DATA ||
!flag_nested(nla) ||
nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
set->type->adt_policy, NULL))
return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+ ret = call_ad(net, ctnl, skb, set, tb, adt,
flags, use_lineno);
if (ret < 0)
return ret;
@@ -1617,70 +1858,28 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set;
- struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
- const struct nlattr *nla;
- u32 flags = flag_exist(nlh);
- bool use_lineno;
- int ret = 0;
-
- if (unlikely(protocol_min_failed(attr) ||
- !attr[IPSET_ATTR_SETNAME] ||
- !((attr[IPSET_ATTR_DATA] != NULL) ^
- (attr[IPSET_ATTR_ADT] != NULL)) ||
- (attr[IPSET_ATTR_DATA] &&
- !flag_nested(attr[IPSET_ATTR_DATA])) ||
- (attr[IPSET_ATTR_ADT] &&
- (!flag_nested(attr[IPSET_ATTR_ADT]) ||
- !attr[IPSET_ATTR_LINENO]))))
- return -IPSET_ERR_PROTOCOL;
-
- set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
- if (!set)
- return -ENOENT;
-
- use_lineno = !!attr[IPSET_ATTR_LINENO];
- if (attr[IPSET_ATTR_DATA]) {
- if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
- attr[IPSET_ATTR_DATA],
- set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
- use_lineno);
- } else {
- int nla_rem;
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_ADD, info->nlh, attr, info->extack);
+}
- nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
- memset(tb, 0, sizeof(*tb));
- if (nla_type(nla) != IPSET_ATTR_DATA ||
- !flag_nested(nla) ||
- nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
- set->type->adt_policy, NULL))
- return -IPSET_ERR_PROTOCOL;
- ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
- flags, use_lineno);
- if (ret < 0)
- return ret;
- }
- }
- return ret;
+static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
+{
+ return ip_set_ad(info->net, info->sk, skb,
+ IPSET_DEL, info->nlh, attr, info->extack);
}
-static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
+ u32 lineno;
if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME] ||
@@ -1697,7 +1896,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
return -IPSET_ERR_PROTOCOL;
rcu_read_lock_bh();
- ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+ ret = set->variant->uadt(set, tb, IPSET_TEST, &lineno, 0, 0);
rcu_read_unlock_bh();
/* Userspace can't trigger element to be re-added */
if (ret == -EAGAIN)
@@ -1708,16 +1907,13 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
/* Get the header data of a set */
-static int ip_set_header(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
const struct ip_set *set;
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(protocol_min_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -1731,7 +1927,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_HEADER);
if (!nlh2)
goto nlmsg_failure;
@@ -1743,11 +1939,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1765,10 +1957,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
};
-static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
@@ -1791,7 +1981,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_TYPE);
if (!nlh2)
goto nlmsg_failure;
@@ -1804,11 +1994,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
nlmsg_end(skb2, nlh2);
pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1824,14 +2010,11 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
};
-static int ip_set_protocol(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
- int ret = 0;
if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
return -IPSET_ERR_PROTOCOL;
@@ -1840,7 +2023,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_PROTOCOL);
if (!nlh2)
goto nlmsg_failure;
@@ -1850,11 +2033,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1865,17 +2044,14 @@ nlmsg_failure:
/* Get set by name or index, from userspace */
-static int ip_set_byname(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_SETNAME]))
@@ -1889,7 +2065,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYNAME);
if (!nlh2)
goto nlmsg_failure;
@@ -1899,11 +2075,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1917,17 +2089,14 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_INDEX] = { .type = NLA_U16 },
};
-static int ip_set_byindex(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const attr[],
- struct netlink_ext_ack *extack)
+static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(net);
+ struct ip_set_net *inst = ip_set_pernet(info->net);
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
ip_set_id_t id = IPSET_INVALID_ID;
const struct ip_set *set;
- int ret = 0;
if (unlikely(protocol_failed(attr) ||
!attr[IPSET_ATTR_INDEX]))
@@ -1944,7 +2113,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
if (!skb2)
return -ENOMEM;
- nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
+ nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0,
IPSET_CMD_GET_BYINDEX);
if (!nlh2)
goto nlmsg_failure;
@@ -1953,11 +2122,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl,
goto nla_put_failure;
nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret < 0)
- return ret;
-
- return 0;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
nla_put_failure:
nlmsg_cancel(skb2, nlh2);
@@ -1969,80 +2134,96 @@ nlmsg_failure:
static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = {
[IPSET_CMD_NONE] = {
.call = ip_set_none,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
},
[IPSET_CMD_CREATE] = {
.call = ip_set_create,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_create_policy,
},
[IPSET_CMD_DESTROY] = {
.call = ip_set_destroy,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_FLUSH] = {
.call = ip_set_flush,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_RENAME] = {
.call = ip_set_rename,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_SWAP] = {
.call = ip_set_swap,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname2_policy,
},
[IPSET_CMD_LIST] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
- .policy = ip_set_setname_policy,
+ .policy = ip_set_dump_policy,
},
[IPSET_CMD_SAVE] = {
.call = ip_set_dump,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_ADD] = {
.call = ip_set_uadd,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_DEL] = {
.call = ip_set_udel,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_TEST] = {
.call = ip_set_utest,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_adt_policy,
},
[IPSET_CMD_HEADER] = {
.call = ip_set_header,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_TYPE] = {
.call = ip_set_type,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_type_policy,
},
[IPSET_CMD_PROTOCOL] = {
.call = ip_set_protocol,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_protocol_policy,
},
[IPSET_CMD_GET_BYNAME] = {
.call = ip_set_byname,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_setname_policy,
},
[IPSET_CMD_GET_BYINDEX] = {
.call = ip_set_byindex,
+ .type = NFNL_CB_MUTEX,
.attr_count = IPSET_ATTR_CMD_MAX,
.policy = ip_set_index_policy,
},
@@ -2107,8 +2288,9 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
}
req_version->version = IPSET_PROTOCOL;
- ret = copy_to_user(user, req_version,
- sizeof(struct ip_set_req_version));
+ if (copy_to_user(user, req_version,
+ sizeof(struct ip_set_req_version)))
+ ret = -EFAULT;
goto done;
}
case IP_SET_OP_GET_BYNAME: {
@@ -2167,7 +2349,8 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
} /* end of switch(op) */
copy:
- ret = copy_to_user(user, data, copylen);
+ if (copy_to_user(user, data, copylen))
+ ret = -EFAULT;
done:
vfree(data);
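Editorial note: both copy_to_user() hunks above fix the same bug. copy_to_user() returns the number of bytes it could not copy, not a negative errno, so assigning its result to ret could leak a positive count to the caller. A minimal sketch of the correct pattern (hypothetical helper, not part of this file):

	static int copy_result_to_user(void __user *dst, const void *src,
				       size_t len)
	{
		/* copy_to_user() returns 0 on success, else the number of
		 * uncopied bytes; map any shortfall to -EFAULT.
		 */
		return copy_to_user(dst, src, len) ? -EFAULT : 0;
	}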
@@ -2204,29 +2387,25 @@ ip_set_net_init(struct net *net)
}
static void __net_exit
-ip_set_net_exit(struct net *net)
+ip_set_net_pre_exit(struct net *net)
{
struct ip_set_net *inst = ip_set_pernet(net);
- struct ip_set *set = NULL;
- ip_set_id_t i;
-
inst->is_deleted = true; /* flag for ip_set_nfnl_put */
+}
- nfnl_lock(NFNL_SUBSYS_IPSET);
- for (i = 0; i < inst->ip_set_max; i++) {
- set = ip_set(inst, i);
- if (set) {
- ip_set(inst, i) = NULL;
- ip_set_destroy_set(set);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_IPSET);
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+ struct ip_set_net *inst = ip_set_pernet(net);
+
+ _destroy_all_sets(inst);
kvfree(rcu_dereference_protected(inst->ip_set_list, 1));
}
static struct pernet_operations ip_set_net_ops = {
.init = ip_set_net_init,
+ .pre_exit = ip_set_net_pre_exit,
.exit = ip_set_net_exit,
.id = &ip_set_net_id,
.size = sizeof(struct ip_set_net),
@@ -2265,8 +2444,11 @@ ip_set_fini(void)
{
nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
unregister_pernet_subsys(&ip_set_net_ops);
+
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
pr_debug("these are the famous last words\n");
}
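Editorial note: the teardown is now split into two phases plus an RCU drain. .pre_exit marks the pernet instance as deleted while nfnetlink handlers may still run, .exit frees the sets, and rcu_barrier() in the module exit path waits for outstanding call_rcu() callbacks before the module text goes away. A condensed sketch of the pattern, with hypothetical names:

	static struct pernet_operations example_net_ops = {
		.init     = example_net_init,
		.pre_exit = example_net_pre_exit,	/* flag as dying */
		.exit     = example_net_exit,		/* actually free */
	};

	static void __exit example_fini(void)
	{
		unregister_pernet_subsys(&example_net_ops);
		rcu_barrier();	/* wait for pending call_rcu() callbacks */
	}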
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 3f09cdb42562..36615eb3eae1 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -147,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
}
EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
#endif
-
-bool
-ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
-{
- bool ret;
- u8 proto;
-
- switch (pf) {
- case NFPROTO_IPV4:
- ret = ip_set_get_ip4_port(skb, src, port, &proto);
- break;
- case NFPROTO_IPV6:
- ret = ip_set_get_ip6_port(skb, src, port, &proto);
- break;
- default:
- return false;
- }
- if (!ret)
- return ret;
- switch (proto) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- return true;
- default:
- return false;
- }
-}
-EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 2c9609929c71..5e4453e9ef8e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1,23 +1,28 @@
-/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
#ifndef _IP_SET_HASH_GEN_H
#define _IP_SET_HASH_GEN_H
#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/jhash.h>
#include <linux/types.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
-
-#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
-#define ipset_dereference_protected(p, set) \
- __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
-
-#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/ipset/ip_set.h>
+
+#define __ipset_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ipset_dereference_nfnl(p) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+#define ipset_dereference_set(p, set) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+ lockdep_is_held(&(set)->lock))
+#define ipset_dereference_bh_nfnl(p) \
+ rcu_dereference_bh_check(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* Hashing that uses arrays to resolve clashes. The hash table is resized
* (doubled) when searching becomes too long.
@@ -33,37 +38,12 @@
*/
/* Number of elements to store in an initial array block */
-#define AHASH_INIT_SIZE 4
+#define AHASH_INIT_SIZE 2
/* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE (3 * AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE (6 * AHASH_INIT_SIZE)
/* Max number of elements in the array block when tuned */
#define AHASH_MAX_TUNED 64
-
-/* Max number of elements can be tuned */
-#ifdef IP_SET_HASH_WITH_MULTI
-#define AHASH_MAX(h) ((h)->ahash_max)
-
-static inline u8
-tune_ahash_max(u8 curr, u32 multi)
-{
- u32 n;
-
- if (multi < curr)
- return curr;
-
- n = curr + AHASH_INIT_SIZE;
- /* Currently, at listing one hash bucket must fit into a message.
- * Therefore we have a hard limit here.
- */
- return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
-}
-
-#define TUNE_AHASH_MAX(h, multi) \
- ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
-#else
-#define AHASH_MAX(h) AHASH_MAX_SIZE
-#define TUNE_AHASH_MAX(h, multi)
-#endif
+#define AHASH_MAX(h) ((h)->bucketsize)
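Editorial note: the compile-time TUNE_AHASH_MAX machinery is gone; the maximal bucket length is now the per-set h->bucketsize, i.e. a runtime parameter. A summary of the resulting limits, derived from the defines above:

	/* Buckets start at AHASH_INIT_SIZE == 2 slots; the default cap is
	 * AHASH_MAX_SIZE == 12. Types built with IP_SET_HASH_WITH_MULTI
	 * may still grow h->bucketsize at runtime, in AHASH_INIT_SIZE
	 * steps, up to AHASH_MAX_TUNED == 64 (see the clash handling in
	 * mtype_add() further down).
	 */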
/* A hash bucket */
struct hbucket {
@@ -72,16 +52,40 @@ struct hbucket {
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
- unsigned char value[0] /* the array of the values */
+ unsigned char value[] /* the array of the values */
__aligned(__alignof__(u64));
};
+/* Region size for locking == 2^HTABLE_REGION_BITS */
+#define HTABLE_REGION_BITS 10
+#define ahash_numof_locks(htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 1 \
+ : jhash_size((htable_bits) - HTABLE_REGION_BITS))
+#define ahash_sizeof_regions(htable_bits) \
+ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
+#define ahash_region(n) \
+ ((n) / jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_start(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 0 \
+ : (h) * jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_end(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \
+ : ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
+
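Editorial note: with HTABLE_REGION_BITS == 10, every lock region spans 2^10 buckets, and small tables share a single lock. A worked example of the mapping defined by the macros above:

	/* htable_bits == 20: ahash_numof_locks(20) == 1 << 10 == 1024
	 *   regions, and bucket i is protected by hregion[i / 1024].lock;
	 * htable_bits == 8:  a single region covers all 256 buckets.
	 */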
+struct htable_gc {
+ struct delayed_work dwork;
+ struct ip_set *set; /* Set the gc belongs to */
+ u32 region; /* Last gc run position */
+};
+
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
atomic_t ref; /* References for resizing */
- atomic_t uref; /* References for dumping */
+ atomic_t uref; /* References for dumping and gc */
u8 htable_bits; /* size of hash table == 2^htable_bits */
- struct hbucket __rcu *bucket[0]; /* hashtable buckets */
+ u32 maxelem; /* Maxelem per region */
+ struct ip_set_region *hregion; /* Region locks and ext sizes */
+ struct hbucket __rcu *bucket[]; /* hashtable buckets */
};
#define hbucket(h, i) ((h)->bucket[i])
@@ -104,31 +108,17 @@ htable_size(u8 hbits)
{
size_t hsize;
- /* We must fit both into u32 in jhash and size_t */
+ /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */
if (hbits > 31)
return 0;
hsize = jhash_size(hbits);
- if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *)
+ if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *)
< hsize)
return 0;
return hsize * sizeof(struct hbucket *) + sizeof(struct htable);
}
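Editorial note: a quick sanity check of the arithmetic, assuming 64-bit pointers:

	/* htable_size(20) == (1 << 20) * sizeof(struct hbucket *)
	 *                    + sizeof(struct htable)   (~8 MiB).
	 * Since kvmalloc() cannot allocate more than INT_MAX bytes, the
	 * INT_MAX guard effectively caps hbits at 27 on 64-bit.
	 */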
-/* Compute htable_bits from the user input parameter hashsize */
-static u8
-htable_bits(u32 hashsize)
-{
- /* Assume that hashsize == 2^htable_bits */
- u8 bits = fls(hashsize - 1);
-
- if (jhash_size(bits) != hashsize)
- /* Round up to the first 2^n value */
- bits = fls(hashsize);
-
- return bits;
-}
-
#ifdef IP_SET_HASH_WITH_NETS
#if IPSET_NET_COUNT > 1
#define __CIDR(cidr, i) (cidr[i])
@@ -166,6 +156,21 @@ htable_bits(u32 hashsize)
#define NLEN 0
#endif /* IP_SET_HASH_WITH_NETS */
+#define SET_ELEM_EXPIRED(set, d) \
+ (SET_WITH_TIMEOUT(set) && \
+ ip_set_timeout_expired(ext_timeout(d, set)))
+
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+static const union nf_inet_addr onesmask = {
+ .all[0] = 0xffffffff,
+ .all[1] = 0xffffffff,
+ .all[2] = 0xffffffff,
+ .all[3] = 0xffffffff
+};
+
+static const union nf_inet_addr zeromask = {};
+#endif
+
#endif /* _IP_SET_HASH_GEN_H */
#ifndef MTYPE
@@ -209,12 +214,15 @@ htable_bits(u32 hashsize)
#undef mtype_test_cidrs
#undef mtype_test
#undef mtype_uref
-#undef mtype_expire
#undef mtype_resize
+#undef mtype_ext_size
+#undef mtype_resize_ad
#undef mtype_head
#undef mtype_list
+#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
+#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match
@@ -251,12 +259,15 @@ htable_bits(u32 hashsize)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
#define mtype_test IPSET_TOKEN(MTYPE, _test)
#define mtype_uref IPSET_TOKEN(MTYPE, _uref)
-#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size)
+#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
@@ -279,34 +290,44 @@ htable_bits(u32 hashsize)
/* The generic hash structure */
struct htype {
struct htable __rcu *table; /* the hash table */
- struct timer_list gc; /* garbage collection when timeout enabled */
- struct ip_set *set; /* attached to this ip_set */
+ struct htable_gc gc; /* gc workqueue */
u32 maxelem; /* max elements in the hash */
u32 initval; /* random jhash init value */
#ifdef IP_SET_HASH_WITH_MARKMASK
u32 markmask; /* markmask value for mark mask to store */
#endif
-#ifdef IP_SET_HASH_WITH_MULTI
- u8 ahash_max; /* max elements in an array block */
-#endif
-#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 bucketsize; /* max elements in an array block */
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
u8 netmask; /* netmask value for subnets to store */
+ union nf_inet_addr bitmask; /* stores bitmask */
#endif
+ struct list_head ad; /* Resize add|del backlist */
struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_NETS
struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
#endif
};
+/* ADD|DEL entries saved during resize */
+struct mtype_resize_ad {
+ struct list_head list;
+ enum ipset_adt ad; /* ADD|DEL element */
+ struct mtype_elem d; /* Element value */
+ struct ip_set_ext ext; /* Extensions for ADD */
+ struct ip_set_ext mext; /* Target extensions for ADD */
+ u32 flags; /* Flags for ADD */
+};
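Editorial note: entries of this type implement the save-and-replay protocol for kernel-side changes that race with a resize:

	/* While a resize holds a reference on the old table, a kernel-side
	 * ADD/DEL allocates one of these, copies the element (and, for
	 * ADD, the extensions), and queues it on h->ad under set->lock;
	 * after the new table is published, mtype_resize() replays the
	 * list via mtype_add()/mtype_del() and frees the entries.
	 */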
+
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
* sized networks. cidr == real cidr + 1 to support /0.
*/
static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
int i, j;
+ spin_lock_bh(&set->lock);
/* Add in increasing prefix order, so larger cidr first */
for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
if (j != -1) {
@@ -315,7 +336,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
j = i;
} else if (h->nets[i].cidr[n] == cidr) {
h->nets[CIDR_POS(cidr)].nets[n]++;
- return;
+ goto unlock;
}
}
if (j != -1) {
@@ -324,24 +345,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
}
h->nets[i].cidr[n] = cidr;
h->nets[CIDR_POS(cidr)].nets[n] = 1;
+unlock:
+ spin_unlock_bh(&set->lock);
}
static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
u8 i, j, net_end = NLEN - 1;
+ spin_lock_bh(&set->lock);
for (i = 0; i < NLEN; i++) {
if (h->nets[i].cidr[n] != cidr)
continue;
h->nets[CIDR_POS(cidr)].nets[n]--;
if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
- return;
+ goto unlock;
for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
h->nets[j].cidr[n] = 0;
- return;
+ goto unlock;
}
+unlock:
+ spin_unlock_bh(&set->lock);
}
#endif
@@ -349,7 +375,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
static size_t
mtype_ahash_memsize(const struct htype *h, const struct htable *t)
{
- return sizeof(*h) + sizeof(*t);
+ return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
}
/* Get the ith element from the array block n */
@@ -373,24 +399,29 @@ mtype_flush(struct ip_set *set)
struct htype *h = set->data;
struct htable *t;
struct hbucket *n;
- u32 i;
-
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
- if (!n)
- continue;
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set, n);
- /* FIXME: use slab cache */
- rcu_assign_pointer(hbucket(t, i), NULL);
- kfree_rcu(n, rcu);
+ u32 r, i;
+
+ t = ipset_dereference_nfnl(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ }
+ t->hregion[r].ext_size = 0;
+ t->hregion[r].elements = 0;
+ spin_unlock_bh(&t->hregion[r].lock);
}
#ifdef IP_SET_HASH_WITH_NETS
memset(h->nets, 0, sizeof(h->nets));
#endif
- set->elements = 0;
- set->ext_size = 0;
}
/* Destroy the hashtable part of the set */
@@ -401,7 +432,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ n = (__force struct hbucket *)hbucket(t, i);
if (!n)
continue;
if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -410,6 +441,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
kfree(n);
}
+ ip_set_free(t->hregion);
ip_set_free(t);
}
@@ -418,28 +450,18 @@ static void
mtype_destroy(struct ip_set *set)
{
struct htype *h = set->data;
+ struct list_head *l, *lt;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&h->gc);
-
- mtype_ahash_destroy(set,
- __ipset_dereference_protected(h->table, 1), true);
+ mtype_ahash_destroy(set, (__force struct htable *)h->table, true);
+ list_for_each_safe(l, lt, &h->ad) {
+ list_del(l);
+ kfree(l);
+ }
kfree(h);
set->data = NULL;
}
-static void
-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
-{
- struct htype *h = set->data;
-
- timer_setup(&h->gc, gc, 0);
- mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
- pr_debug("gc initialized, run in every %u\n",
- IPSET_GC_PERIOD(set->timeout));
-}
-
static bool
mtype_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -449,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
/* Resizing changes htable_bits, so we ignore it */
return x->maxelem == y->maxelem &&
a->timeout == b->timeout &&
-#ifdef IP_SET_HASH_WITH_NETMASK
- x->netmask == y->netmask &&
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ nf_inet_addr_cmp(&x->bitmask, &y->bitmask) &&
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
x->markmask == y->markmask &&
@@ -458,11 +480,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
-/* Delete expired elements from the hashtable */
static void
-mtype_expire(struct ip_set *set, struct htype *h)
+mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
{
- struct htable *t;
struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
@@ -470,10 +490,12 @@ mtype_expire(struct ip_set *set, struct htype *h)
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
+ u8 htable_bits = t->htable_bits;
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, htable_bits);
+ i < ahash_bucket_end(r, htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
for (j = 0, d = 0; j < n->pos; j++) {
@@ -489,58 +511,109 @@ mtype_expire(struct ip_set *set, struct htype *h)
smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, k)),
k);
#endif
+ t->hregion[r].elements--;
ip_set_ext_destroy(set, data);
- set->elements--;
d++;
}
if (d >= AHASH_INIT_SIZE) {
if (d >= n->size) {
+ t->hregion[r].ext_size -=
+ ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, i), NULL);
kfree_rcu(n, rcu);
continue;
}
tmp = kzalloc(sizeof(*tmp) +
- (n->size - AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
- /* Still try to delete expired elements */
+ /* Still try to delete expired elements. */
continue;
tmp->size = n->size - AHASH_INIT_SIZE;
for (j = 0, d = 0; j < n->pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
- memcpy(tmp->value + d * dsize, data, dsize);
+ memcpy(tmp->value + d * dsize,
+ data, dsize);
set_bit(d, tmp->used);
d++;
}
tmp->pos = d;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, i), tmp);
kfree_rcu(n, rcu);
}
}
+ spin_unlock_bh(&t->hregion[r].lock);
}
static void
-mtype_gc(struct timer_list *t)
+mtype_gc(struct work_struct *work)
{
- struct htype *h = from_timer(h, t, gc);
- struct ip_set *set = h->set;
+ struct htable_gc *gc;
+ struct ip_set *set;
+ struct htype *h;
+ struct htable *t;
+ u32 r, numof_locks;
+ unsigned int next_run;
+
+ gc = container_of(work, struct htable_gc, dwork.work);
+ set = gc->set;
+ h = set->data;
- pr_debug("called\n");
spin_lock_bh(&set->lock);
- mtype_expire(set, h);
+ t = ipset_dereference_set(h->table, set);
+ atomic_inc(&t->uref);
+ numof_locks = ahash_numof_locks(t->htable_bits);
+ r = gc->region++;
+ if (r >= numof_locks) {
+ r = gc->region = 0;
+ }
+ next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
+ if (next_run < HZ/10)
+ next_run = HZ/10;
spin_unlock_bh(&set->lock);
- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&h->gc);
+ mtype_gc_do(set, h, t, r);
+
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by expire: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
}
+static void
+mtype_gc_init(struct htable_gc *gc)
+{
+ INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
+}
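Editorial note: the garbage collector is no longer a kernel timer; it is a deferrable delayed work that expires a single lock region per run, so a full sweep is spread across IPSET_GC_PERIOD. A stripped-down sketch of the re-arming pattern used above (illustrative names, not the generated mtype_* ones):

	static void example_gc(struct work_struct *work)
	{
		struct htable_gc *gc = container_of(work, struct htable_gc,
						    dwork.work);

		/* ... expire the next region of gc->set here ... */
		queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
	}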
+
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ cancel_delayed_work_sync(&h->gc.dwork);
+}
+
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags);
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags);
+
/* Resize a hash: create a new hash table with doubling the hashsize
* and inserting the elements to it. Repeat until we succeed or
* fail due to memory pressures.
@@ -551,7 +624,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t extsize, dsize = set->dsize;
+ size_t hsize, dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -559,7 +632,9 @@ mtype_resize(struct ip_set *set, bool retried)
struct mtype_elem *data;
struct mtype_elem *d;
struct hbucket *n, *m;
- u32 i, j, key;
+ struct list_head *l, *lt;
+ struct mtype_resize_ad *x;
+ u32 i, j, r, nr, key;
int ret;
#ifdef IP_SET_HASH_WITH_NETS
@@ -567,108 +642,140 @@ mtype_resize(struct ip_set *set, bool retried)
if (!tmp)
return -ENOMEM;
#endif
- rcu_read_lock_bh();
- orig = rcu_dereference_bh_nfnl(h->table);
+ orig = ipset_dereference_bh_nfnl(h->table);
htable_bits = orig->htable_bits;
- rcu_read_unlock_bh();
retry:
ret = 0;
htable_bits++;
- if (!htable_bits) {
- /* In case we have plenty of memory :-) */
- pr_warn("Cannot increase the hashsize of set %s further\n",
- set->name);
- ret = -IPSET_ERR_HASH_FULL;
+ if (!htable_bits)
+ goto hbwarn;
+ hsize = htable_size(htable_bits);
+ if (!hsize)
+ goto hbwarn;
+ t = ip_set_alloc(hsize);
+ if (!t) {
+ ret = -ENOMEM;
goto out;
}
- t = ip_set_alloc(htable_size(htable_bits));
- if (!t) {
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
+ if (!t->hregion) {
+ ip_set_free(t);
ret = -ENOMEM;
goto out;
}
t->htable_bits = htable_bits;
+ t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
+ for (i = 0; i < ahash_numof_locks(htable_bits); i++)
+ spin_lock_init(&t->hregion[i].lock);
- spin_lock_bh(&set->lock);
- orig = __ipset_dereference_protected(h->table, 1);
- /* There can't be another parallel resizing, but dumping is possible */
+ /* There can't be another parallel resizing,
+	 * but dumping, gc and kernel-side add/del are possible.
+ */
+ orig = ipset_dereference_bh_nfnl(h->table);
atomic_set(&orig->ref, 1);
atomic_inc(&orig->uref);
- extsize = 0;
pr_debug("attempt to resize set %s from %u to %u, t %p\n",
set->name, orig->htable_bits, htable_bits, orig);
- for (i = 0; i < jhash_size(orig->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(orig, i), 1);
- if (!n)
- continue;
- for (j = 0; j < n->pos; j++) {
- if (!test_bit(j, n->used))
+ for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
+ /* Expire may replace a hbucket with another one */
+ rcu_read_lock_bh();
+ for (i = ahash_bucket_start(r, orig->htable_bits);
+ i < ahash_bucket_end(r, orig->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(orig, i));
+ if (!n)
continue;
- data = ahash_data(n, j, dsize);
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ if (SET_ELEM_EXPIRED(set, data))
+ continue;
#ifdef IP_SET_HASH_WITH_NETS
- /* We have readers running parallel with us,
- * so the live data cannot be modified.
- */
- flags = 0;
- memcpy(tmp, data, dsize);
- data = tmp;
- mtype_data_reset_flags(data, &flags);
+ /* We have readers running parallel with us,
+ * so the live data cannot be modified.
+ */
+ flags = 0;
+ memcpy(tmp, data, dsize);
+ data = tmp;
+ mtype_data_reset_flags(data, &flags);
#endif
- key = HKEY(data, h->initval, htable_bits);
- m = __ipset_dereference_protected(hbucket(t, key), 1);
- if (!m) {
- m = kzalloc(sizeof(*m) +
+ key = HKEY(data, h->initval, htable_bits);
+ m = __ipset_dereference(hbucket(t, key));
+ nr = ahash_region(key);
+ if (!m) {
+ m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
GFP_ATOMIC);
- if (!m) {
- ret = -ENOMEM;
- goto cleanup;
- }
- m->size = AHASH_INIT_SIZE;
- extsize = ext_size(AHASH_INIT_SIZE, dsize);
- RCU_INIT_POINTER(hbucket(t, key), m);
- } else if (m->pos >= m->size) {
- struct hbucket *ht;
-
- if (m->size >= AHASH_MAX(h)) {
- ret = -EAGAIN;
- } else {
- ht = kzalloc(sizeof(*ht) +
+ if (!m) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ m->size = AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ RCU_INIT_POINTER(hbucket(t, key), m);
+ } else if (m->pos >= m->size) {
+ struct hbucket *ht;
+
+ if (m->size >= AHASH_MAX(h)) {
+ ret = -EAGAIN;
+ } else {
+ ht = kzalloc(sizeof(*ht) +
(m->size + AHASH_INIT_SIZE)
* dsize,
GFP_ATOMIC);
- if (!ht)
- ret = -ENOMEM;
+ if (!ht)
+ ret = -ENOMEM;
+ }
+ if (ret < 0)
+ goto cleanup;
+ memcpy(ht, m, sizeof(struct hbucket) +
+ m->size * dsize);
+ ht->size = m->size + AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ kfree(m);
+ m = ht;
+ RCU_INIT_POINTER(hbucket(t, key), ht);
}
- if (ret < 0)
- goto cleanup;
- memcpy(ht, m, sizeof(struct hbucket) +
- m->size * dsize);
- ht->size = m->size + AHASH_INIT_SIZE;
- extsize += ext_size(AHASH_INIT_SIZE, dsize);
- kfree(m);
- m = ht;
- RCU_INIT_POINTER(hbucket(t, key), ht);
- }
- d = ahash_data(m, m->pos, dsize);
- memcpy(d, data, dsize);
- set_bit(m->pos++, m->used);
+ d = ahash_data(m, m->pos, dsize);
+ memcpy(d, data, dsize);
+ set_bit(m->pos++, m->used);
+ t->hregion[nr].elements++;
#ifdef IP_SET_HASH_WITH_NETS
- mtype_data_reset_flags(d, &flags);
+ mtype_data_reset_flags(d, &flags);
#endif
+ }
}
+ rcu_read_unlock_bh();
}
- rcu_assign_pointer(h->table, t);
- set->ext_size = extsize;
- spin_unlock_bh(&set->lock);
+ /* There can't be any other writer. */
+ rcu_assign_pointer(h->table, t);
/* Give time to other readers of the set */
synchronize_rcu();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
- /* If there's nobody else dumping the table, destroy it */
+ /* Add/delete elements processed by the SET target during resize.
+ * Kernel-side add cannot trigger a resize and userspace actions
+ * are serialized by the mutex.
+ */
+ list_for_each_safe(l, lt, &h->ad) {
+ x = list_entry(l, struct mtype_resize_ad, list);
+ if (x->ad == IPSET_ADD) {
+ mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
+ } else {
+ mtype_del(set, &x->d, NULL, NULL, 0);
+ }
+ list_del(l);
+ kfree(l);
+ }
+ /* If there's nobody else using the table, destroy it */
if (atomic_dec_and_test(&orig->uref)) {
pr_debug("Table destroy by resize %p\n", orig);
mtype_ahash_destroy(set, orig, false);
@@ -681,13 +788,48 @@ out:
return ret;
cleanup:
+ rcu_read_unlock_bh();
atomic_set(&orig->ref, 0);
atomic_dec(&orig->uref);
- spin_unlock_bh(&set->lock);
mtype_ahash_destroy(set, t, false);
if (ret == -EAGAIN)
goto retry;
goto out;
+
+hbwarn:
+ /* In case we have plenty of memory :-) */
+ pr_warn("Cannot increase the hashsize of set %s further\n", set->name);
+ ret = -IPSET_ERR_HASH_FULL;
+ goto out;
+}
+
+/* Get the current number of elements and ext_size in the set */
+static void
+mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
+{
+ struct htype *h = set->data;
+ const struct htable *t;
+ u32 i, j, r;
+ struct hbucket *n;
+ struct mtype_elem *data;
+
+ t = rcu_dereference_bh(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = rcu_dereference_bh(hbucket(t, i));
+ if (!n)
+ continue;
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, set->dsize);
+ if (!SET_ELEM_EXPIRED(set, data))
+ (*elements)++;
+ }
+ }
+ *ext_size += t->hregion[r].ext_size;
+ }
}
/* Add an element to a hash and update the internal counters on success,
@@ -702,32 +844,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n, *old = ERR_PTR(-ENOENT);
- int i, j = -1;
+ int i, j = -1, ret;
bool flag_exist = flags & IPSET_FLAG_EXIST;
bool deleted = false, forceadd = false, reuse = false;
- u32 key, multi = 0;
+ u32 r, key, multi = 0, elements, maxelem;
- if (set->elements >= h->maxelem) {
- if (SET_WITH_TIMEOUT(set))
- /* FIXME: when set is full, we slow down here */
- mtype_expire(set, h);
- if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ r = ahash_region(key);
+ atomic_inc(&t->uref);
+ elements = t->hregion[r].elements;
+ maxelem = t->maxelem;
+ if (elements >= maxelem) {
+ u32 e;
+ if (SET_WITH_TIMEOUT(set)) {
+ rcu_read_unlock_bh();
+ mtype_gc_do(set, h, t, r);
+ rcu_read_lock_bh();
+ }
+ maxelem = h->maxelem;
+ elements = 0;
+ for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
+ elements += t->hregion[e].elements;
+ if (elements >= maxelem && SET_WITH_FORCEADD(set))
forceadd = true;
}
+ rcu_read_unlock_bh();
- t = ipset_dereference_protected(h->table, set);
- key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n) {
- if (forceadd || set->elements >= h->maxelem)
+ if (forceadd || elements >= maxelem)
goto set_full;
old = NULL;
n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
n->size = AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
for (i = 0; i < n->pos; i++) {
@@ -741,67 +900,75 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi)) {
- if (flag_exist ||
- (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))) {
+ if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
/* Just the extensions could be overwritten */
j = i;
goto overwrite_extensions;
}
- return -IPSET_ERR_EXIST;
+ ret = -IPSET_ERR_EXIST;
+ goto unlock;
}
/* Reuse first timed out entry */
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)) &&
- j == -1) {
+ if (SET_ELEM_EXPIRED(set, data) && j == -1) {
j = i;
reuse = true;
}
}
if (reuse || forceadd) {
+ if (j == -1)
+ j = 0;
data = ahash_data(n, j, set->dsize);
if (!deleted) {
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, i)),
i);
#endif
ip_set_ext_destroy(set, data);
- set->elements--;
+ t->hregion[r].elements--;
}
goto copy_data;
}
- if (set->elements >= h->maxelem)
+ if (elements >= maxelem)
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
- TUNE_AHASH_MAX(h, multi);
+#ifdef IP_SET_HASH_WITH_MULTI
+ if (h->bucketsize >= AHASH_MAX_TUNED)
+ goto set_full;
+ else if (h->bucketsize <= multi)
+ h->bucketsize += AHASH_INIT_SIZE;
+#endif
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto resize;
}
old = n;
n = kzalloc(sizeof(*n) +
(old->size + AHASH_INIT_SIZE) * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
memcpy(n, old, sizeof(struct hbucket) +
old->size * set->dsize);
n->size = old->size + AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
}
copy_elem:
j = n->pos++;
data = ahash_data(n, j, set->dsize);
copy_data:
- set->elements++;
+ t->hregion[r].elements++;
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+ mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
#endif
memcpy(data, d, sizeof(struct mtype_elem));
overwrite_extensions:
@@ -824,13 +991,41 @@ overwrite_extensions:
if (old)
kfree_rcu(old, rcu);
}
+ ret = 0;
+resize:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (atomic_read(&t->ref) && ext->target) {
+		/* Resize in progress and kernel-side add: save values */
+ struct mtype_resize_ad *x;
+
+ x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
+ if (!x)
+ /* Don't bother */
+ goto out;
+ x->ad = IPSET_ADD;
+ memcpy(&x->d, value, sizeof(struct mtype_elem));
+ memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
+ memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
+ x->flags = flags;
+ spin_lock_bh(&set->lock);
+ list_add_tail(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ goto out;
- return 0;
set_full:
if (net_ratelimit())
pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- return -IPSET_ERR_HASH_FULL;
+ set->name, maxelem);
+ ret = -IPSET_ERR_HASH_FULL;
+unlock:
+ spin_unlock_bh(&t->hregion[r].lock);
+out:
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by add: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+ return ret;
}
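Editorial note: the unlock/out tail above is one instance of the table reference protocol shared by add, del, gc and dump:

	/* t->uref counts active users (add/del/gc/dump);
	 * t->ref is set by the resizer once t has been replaced.
	 * Every user does atomic_inc(&t->uref) before touching the table
	 * and, on the way out,
	 *   if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref))
	 *           mtype_ahash_destroy(set, t, false);
	 * so the last user of an already-replaced table frees it.
	 */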
/* Delete an element from the hash and free up space if possible.
@@ -844,13 +1039,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n;
- int i, j, k, ret = -IPSET_ERR_EXIST;
+ struct mtype_resize_ad *x = NULL;
+ int i, j, k, r, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
size_t dsize = set->dsize;
- t = ipset_dereference_protected(h->table, set);
+	/* Userspace add and resize are excluded by the mutex.
+	 * Kernel-side add does not trigger a resize.
+ */
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ r = ahash_region(key);
+ atomic_inc(&t->uref);
+ rcu_read_unlock_bh();
+
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n)
goto out;
for (i = 0, k = 0; i < n->pos; i++) {
@@ -861,8 +1066,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
data = ahash_data(n, i, dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))
+ if (SET_ELEM_EXPIRED(set, data))
goto out;
ret = 0;
@@ -870,20 +1074,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
smp_mb__after_atomic();
if (i + 1 == n->pos)
n->pos--;
- set->elements--;
+ t->hregion[r].elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
- mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
- j);
+ mtype_del_cidr(set, h,
+ NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
#endif
ip_set_ext_destroy(set, data);
+ if (atomic_read(&t->ref) && ext->target) {
+			/* Resize in progress and kernel-side del:
+			 * save values.
+			 */
+ x = kzalloc(sizeof(struct mtype_resize_ad),
+ GFP_ATOMIC);
+ if (x) {
+ x->ad = IPSET_DEL;
+ memcpy(&x->d, value,
+ sizeof(struct mtype_elem));
+ x->flags = flags;
+ }
+ }
for (; i < n->pos; i++) {
if (!test_bit(i, n->used))
k++;
}
if (n->pos == 0 && k == 0) {
- set->ext_size -= ext_size(n->size, dsize);
+ t->hregion[r].ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
} else if (k >= AHASH_INIT_SIZE) {
@@ -902,7 +1119,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
tmp->pos = k;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, key), tmp);
kfree_rcu(n, rcu);
}
@@ -910,10 +1128,20 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
out:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (x) {
+ spin_lock_bh(&set->lock);
+ list_add(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by del: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
return ret;
}
-static inline int
+static int
mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
struct ip_set_ext *mext, struct ip_set *set, u32 flags)
{
@@ -957,7 +1185,7 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
#endif
key = HKEY(d, h->initval, t->htable_bits);
- n = rcu_dereference_bh(hbucket(t, key));
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n)
continue;
for (i = 0; i < n->pos; i++) {
@@ -995,6 +1223,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, ret = 0;
u32 key, multi = 0;
+ rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
#ifdef IP_SET_HASH_WITH_NETS
/* If we test an IP address and not a network address,
@@ -1026,6 +1255,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto out;
}
out:
+ rcu_read_unlock_bh();
return ret;
}
@@ -1037,49 +1267,57 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
const struct htable *t;
struct nlattr *nested;
size_t memsize;
+ u32 elements = 0;
+ size_t ext_size = 0;
u8 htable_bits;
- /* If any members have expired, set->elements will be wrong
- * mytype_expire function will update it with the right count.
- * we do not hold set->lock here, so grab it first.
- * set->elements can still be incorrect in the case of a huge set,
- * because elements might time out during the listing.
- */
- if (SET_WITH_TIMEOUT(set)) {
- spin_lock_bh(&set->lock);
- mtype_expire(set, h);
- spin_unlock_bh(&set->lock);
- }
-
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
- memsize = mtype_ahash_memsize(h, t) + set->ext_size;
+ t = rcu_dereference_bh(h->table);
+ mtype_ext_size(set, &elements, &ext_size);
+ memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
htable_bits = t->htable_bits;
rcu_read_unlock_bh();
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
htonl(jhash_size(htable_bits))) ||
nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_BITMASK
+ /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask
+	/* If netmask is set to anything other than HOST_MASK, the user
+	 * supplied a netmask and not a bitmask. The two options are
+	 * mutually exclusive.
+	 */
+ if (set->family == NFPROTO_IPV4) {
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip))
+ goto nla_put_failure;
+ } else if (set->family == NFPROTO_IPV6) {
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6))
+ goto nla_put_failure;
+ }
+ }
+#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- if (h->netmask != HOST_MASK &&
- nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
goto nla_put_failure;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure;
#endif
+ if (set->flags & IPSET_CREATE_FLAG_BUCKETSIZE) {
+ if (nla_put_u8(skb, IPSET_ATTR_BUCKETSIZE, h->bucketsize) ||
+ nla_put_net32(skb, IPSET_ATTR_INITVAL, htonl(h->initval)))
+ goto nla_put_failure;
+ }
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
- nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
return 0;
nla_put_failure:
@@ -1095,15 +1333,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
if (start) {
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
+ t = ipset_dereference_bh_nfnl(h->table);
atomic_inc(&t->uref);
cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
rcu_read_unlock_bh();
} else if (cb->args[IPSET_CB_PRIVATE]) {
t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
- /* Resizing didn't destroy the hash table */
- pr_debug("Table destroy by dump: %p\n", t);
+			pr_debug("Table destroy after resize by dump: %p\n",
+				 t);
mtype_ahash_destroy(set, t, false);
}
cb->args[IPSET_CB_PRIVATE] = 0;
@@ -1124,7 +1362,7 @@ mtype_list(const struct ip_set *set,
void *incomplete;
int i, ret = 0;
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ atd = nla_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
@@ -1145,12 +1383,11 @@ mtype_list(const struct ip_set *set,
if (!test_bit(i, n->used))
continue;
e = ahash_data(n, i, set->dsize);
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ if (SET_ELEM_EXPIRED(set, e))
continue;
pr_debug("list hash %lu hbucket %p i %u, data %p\n",
cb->args[IPSET_CB_ARG0], n, i, e);
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
if (cb->args[IPSET_CB_ARG0] == first) {
nla_nest_cancel(skb, atd);
@@ -1163,10 +1400,10 @@ mtype_list(const struct ip_set *set,
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, e, true))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
}
}
- ipset_nest_end(skb, atd);
+ nla_nest_end(skb, atd);
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
@@ -1180,7 +1417,7 @@ nla_put_failure:
cb->args[IPSET_CB_ARG0] = 0;
ret = -EMSGSIZE;
} else {
- ipset_nest_end(skb, atd);
+ nla_nest_end(skb, atd);
}
out:
rcu_read_unlock();
@@ -1212,6 +1449,8 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
+ .region_lock = true,
};
#ifdef IP_SET_EMIT_CREATE
@@ -1224,12 +1463,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
u32 markmask;
#endif
u8 hbits;
-#ifdef IP_SET_HASH_WITH_NETMASK
- u8 netmask;
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ int ret __attribute__((unused)) = 0;
+ u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
+ union nf_inet_addr bitmask = onesmask;
#endif
size_t hsize;
struct htype *h;
struct htable *t;
+ u32 i;
pr_debug("Create set %s with family %s\n",
set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
@@ -1262,7 +1504,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
if (tb[IPSET_ATTR_NETMASK]) {
netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
@@ -1270,6 +1511,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
(set->family == NFPROTO_IPV6 && netmask > 128) ||
netmask == 0)
return -IPSET_ERR_INVALID_NETMASK;
+
+		/* Convert the netmask to a bitmask and store it. */
+ if (set->family == NFPROTO_IPV4)
+ bitmask.ip = ip_set_netmask(netmask);
+ else
+ ip6_netmask(&bitmask, netmask);
+ }
+#endif
+
+#ifdef IP_SET_HASH_WITH_BITMASK
+ if (tb[IPSET_ATTR_BITMASK]) {
+		/* Bitmask and netmask do the same thing;
+		 * allow only one of the two options.
+		 */
+ if (tb[IPSET_ATTR_NETMASK])
+ return -IPSET_ERR_BITMASK_NETMASK_EXCL;
+
+ if (set->family == NFPROTO_IPV4) {
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip);
+ if (ret || !bitmask.ip)
+ return -IPSET_ERR_INVALID_NETMASK;
+ } else if (set->family == NFPROTO_IPV6) {
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask);
+ if (ret || ipv6_addr_any(&bitmask.in6))
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+
+ if (nf_inet_addr_cmp(&bitmask, &zeromask))
+ return -IPSET_ERR_INVALID_NETMASK;
}
#endif
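Editorial note: internally a netmask is just a special case of a bitmask, so both options feed the same stored value. Some examples of what ends up in h->bitmask:

	/* IPv4:
	 *   netmask 24          -> bitmask 255.255.255.0
	 *   bitmask 255.0.255.0 -> accepted here, yet not expressible as
	 *                          a netmask
	 * An all-zero bitmask is rejected, as it would map every element
	 * to 0.
	 */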
@@ -1287,7 +1555,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
if (!h)
return -ENOMEM;
- hbits = htable_bits(hashsize);
+ /* Compute htable_bits from the user input parameter hashsize.
+ * Assume that hashsize == 2^htable_bits,
+ * otherwise round up to the first 2^n value.
+ */
+ hbits = fls(hashsize - 1);
hsize = htable_size(hbits);
if (hsize == 0) {
kfree(h);
@@ -1298,19 +1570,42 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
kfree(h);
return -ENOMEM;
}
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
+ if (!t->hregion) {
+ ip_set_free(t);
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->gc.set = set;
+ for (i = 0; i < ahash_numof_locks(hbits); i++)
+ spin_lock_init(&t->hregion[i].lock);
h->maxelem = maxelem;
-#ifdef IP_SET_HASH_WITH_NETMASK
+#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK)
+ h->bitmask = bitmask;
h->netmask = netmask;
#endif
#ifdef IP_SET_HASH_WITH_MARKMASK
h->markmask = markmask;
#endif
- get_random_bytes(&h->initval, sizeof(h->initval));
-
+ if (tb[IPSET_ATTR_INITVAL])
+ h->initval = ntohl(nla_get_be32(tb[IPSET_ATTR_INITVAL]));
+ else
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->bucketsize = AHASH_MAX_SIZE;
+ if (tb[IPSET_ATTR_BUCKETSIZE]) {
+ h->bucketsize = nla_get_u8(tb[IPSET_ATTR_BUCKETSIZE]);
+ if (h->bucketsize < AHASH_INIT_SIZE)
+ h->bucketsize = AHASH_INIT_SIZE;
+ else if (h->bucketsize > AHASH_MAX_SIZE)
+ h->bucketsize = AHASH_MAX_SIZE;
+ else if (h->bucketsize % 2)
+ h->bucketsize += 1;
+ }
t->htable_bits = hbits;
+ t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
- h->set = set;
+ INIT_LIST_HEAD(&h->ad);
set->data = h;
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4) {
@@ -1333,12 +1628,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4)
#endif
- IPSET_TOKEN(HTYPE, 4_gc_init)(set,
- IPSET_TOKEN(HTYPE, 4_gc));
+ IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
#ifndef IP_SET_PROTO_UNDEF
else
- IPSET_TOKEN(HTYPE, 6_gc_init)(set,
- IPSET_TOKEN(HTYPE, 6_gc));
+ IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
#endif
}
pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 613eb212cb48..c9f4e3859663 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip type */
@@ -27,16 +23,19 @@
/* 1 Counters support */
/* 2 Comments support */
/* 3 Forceadd support */
-#define IPSET_TYPE_REV_MAX 4 /* skbinfo support */
+/* 4 skbinfo support */
+/* 5 bucketsize, initval support */
+#define IPSET_TYPE_REV_MAX 6 /* bitmask support */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip");
/* Type specific function prefix */
#define HTYPE hash_ip
#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -48,7 +47,7 @@ struct hash_ip4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ip4_data_equal(const struct hash_ip4_elem *e1,
const struct hash_ip4_elem *e2,
u32 *multi)
@@ -67,7 +66,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
{
next->ip = e->ip;
@@ -89,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
__be32 ip;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
- ip &= ip_set_netmask(h->netmask);
+ ip &= h->bitmask.ip;
if (ip == 0)
return -EINVAL;
@@ -101,11 +100,11 @@ static int
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ip4 *h = set->data;
+ struct hash_ip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ip4_elem e = { 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0, hosts;
+ u32 ip = 0, ip_to = 0, hosts, i = 0;
int ret = 0;
if (tb[IPSET_ATTR_LINENO])
@@ -122,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- ip &= ip_set_hostmask(h->netmask);
+ ip &= ntohl(h->bitmask.ip);
e.ip = htonl(ip);
if (e.ip == 0)
return -IPSET_ERR_HASH_ELEM;
@@ -135,8 +134,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -147,18 +149,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
- if (retried) {
+ if (retried)
ip = ntohl(h->next.ip);
+ for (; ip <= ip_to; i++) {
e.ip = htonl(ip);
- }
- for (; ip <= ip_to;) {
+ if (i > IPSET_MAX_RANGE) {
+ hash_ip4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
ip += hosts;
- e.ip = htonl(ip);
- if (e.ip == 0)
+ if (ip == 0)
return 0;
ret = 0;
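Editorial note: the rewritten loop advances in steps of `hosts` and aborts overly large requests. A worked example with netmask 24:

	/* hosts == 2 << (32 - 24 - 1) == 256, so adding 1.2.0.0-1.2.3.255
	 * takes four iterations: 1.2.0.0, 1.2.1.0, 1.2.2.0, 1.2.3.0.
	 * Once i exceeds IPSET_MAX_RANGE, the current position is recorded
	 * in h->next and -ERANGE is returned to userspace.
	 */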
@@ -175,7 +179,7 @@ struct hash_ip6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
const struct hash_ip6_elem *ip2,
u32 *multi)
@@ -183,12 +187,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
}
-static inline void
-hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
-{
- ip6_netmask(ip, prefix);
-}
-
static bool
hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
@@ -200,7 +198,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ip6_data_next(struct hash_ip6_elem *next, const struct hash_ip6_elem *e)
{
}
@@ -225,7 +223,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -EINVAL;
@@ -264,7 +262,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
- hash_ip6_netmask(&e.ip, h->netmask);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
if (ipv6_addr_any(&e.ip.in6))
return -IPSET_ERR_HASH_ELEM;
@@ -281,14 +279,17 @@ static struct ip_set_type hash_ip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
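
On the netmask-to-bitmask conversion running through this file: a CIDR netmask is just the bitmask whose set bits are contiguous from the top, while the new IPSET_ATTR_BITMASK also admits masks with holes that ip_set_netmask() can never express. A libc-only sketch of the difference:

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
        in_addr_t ip = inet_addr("192.168.41.7");       /* network byte order */
        in_addr_t cidr24 = inet_addr("255.255.255.0");  /* netmask 24 */
        in_addr_t holed = inet_addr("255.255.0.255");   /* no CIDR equivalent */
        struct in_addr a;

        a.s_addr = ip & cidr24;
        printf("netmask 24 -> %s\n", inet_ntoa(a));     /* 192.168.41.0 */
        a.s_addr = ip & holed;
        printf("bitmask    -> %s\n", inet_ntoa(a));     /* 192.168.0.7 */
        return 0;
}
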
diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c
index c830c68142ff..467c59a83c0a 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmac.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmac.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* Kernel module implementing an IP set type: the hash:ip,mac type */
@@ -26,7 +23,7 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>");
@@ -50,7 +47,7 @@ struct hash_ipmac4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1,
const struct hash_ipmac4_elem *e2,
u32 *multi)
@@ -70,7 +67,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmac4_data_next(struct hash_ipmac4_elem *next,
const struct hash_ipmac4_elem *e)
{
@@ -92,15 +89,11 @@ hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb,
struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- /* MAC can be src only */
- if (!(opt->flags & IPSET_DIM_TWO_SRC))
- return 0;
-
if (skb_mac_header(skb) < skb->head ||
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- if (opt->flags & IPSET_DIM_ONE_SRC)
+ if (opt->flags & IPSET_DIM_TWO_SRC)
ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
else
ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
@@ -161,7 +154,7 @@ struct hash_ipmac6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1,
const struct hash_ipmac6_elem *e2,
u32 *multi)
@@ -182,7 +175,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmac6_data_next(struct hash_ipmac6_elem *next,
const struct hash_ipmac6_elem *e)
{
@@ -216,7 +209,7 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb,
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- if (opt->flags & IPSET_DIM_ONE_SRC)
+ if (opt->flags & IPSET_DIM_TWO_SRC)
ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
else
ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
@@ -275,11 +268,13 @@ static struct ip_set_type hash_ipmac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
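
The two kadt hunks above fix a src/dst mix-up: the MAC address is dimension two of hash:ip,mac, so its direction must come from IPSET_DIM_TWO_SRC, not from the IP dimension's flag. A small model, with assumed flag values (the real ones live in the uapi ip_set.h header):

#include <stdio.h>

#define IPSET_DIM_ONE_SRC 0x08  /* assumed bit layout, for illustration */
#define IPSET_DIM_TWO_SRC 0x10

static const char *pick_mac(unsigned int flags,
                            const char *h_source, const char *h_dest)
{
        /* the old code tested IPSET_DIM_ONE_SRC here, so the second
         * dimension silently followed the first one's direction */
        return (flags & IPSET_DIM_TWO_SRC) ? h_source : h_dest;
}

int main(void)
{
        unsigned int flags = IPSET_DIM_TWO_SRC; /* e.g. a "dst,src" match */

        printf("%s\n", pick_mac(flags, "aa:bb:cc:dd:ee:ff",
                                "11:22:33:44:55:66"));
        return 0;
}

With the old test, a dst,src match compared the destination MAC even though the user asked for the source.
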
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index f3ba8348cf9d..a22ec1a6f6ec 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -1,10 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- * Copyright (C) 2013 Smoothwall Ltd. <vytas.dauksa@smoothwall.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,mark type */
@@ -26,7 +21,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support */
+/* 2 skbinfo support */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>");
@@ -47,7 +43,7 @@ struct hash_ipmark4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1,
const struct hash_ipmark4_elem *ip2,
u32 *multi)
@@ -69,7 +65,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmark4_data_next(struct hash_ipmark4_elem *next,
const struct hash_ipmark4_elem *d)
{
@@ -101,11 +97,11 @@ static int
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipmark4 *h = set->data;
+ struct hash_ipmark4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipmark4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0;
+ u32 ip, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -125,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
e.mark &= h->markmask;
+ if (e.mark == 0 && e.ip == 0)
+ return -IPSET_ERR_HASH_ELEM;
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
@@ -137,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
return ret;
- if (ip > ip_to)
+ if (ip > ip_to) {
+ if (e.mark == 0 && ip_to == 0)
+ return -IPSET_ERR_HASH_ELEM;
swap(ip, ip_to);
+ }
} else if (tb[IPSET_ATTR_CIDR]) {
u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
@@ -149,8 +150,12 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
- for (; ip <= ip_to; ip++) {
+ for (; ip <= ip_to; ip++, i++) {
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipmark4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -170,7 +175,7 @@ struct hash_ipmark6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1,
const struct hash_ipmark6_elem *ip2,
u32 *multi)
@@ -192,7 +197,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipmark6_data_next(struct hash_ipmark6_elem *next,
const struct hash_ipmark6_elem *d)
{
@@ -279,12 +284,14 @@ static struct ip_set_type hash_ipmark_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipmark_create,
.create_policy = {
[IPSET_ATTR_MARKMASK] = { .type = NLA_U32 },
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
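
A sketch of the zero-element guards added above, using local stand-ins: once markmask has been applied, an entry whose ip and mark are both zero can never be stored or matched sensibly, so the add is rejected up front instead of creating a dead entry:

#include <stdio.h>

struct elem { unsigned int ip, mark; };

static int check_elem(struct elem *e, unsigned int markmask)
{
        e->mark &= markmask;
        if (e->mark == 0 && e->ip == 0)
                return -1;      /* the kernel returns -IPSET_ERR_HASH_ELEM */
        return 0;
}

int main(void)
{
        struct elem e = { .ip = 0, .mark = 0x00ff0000 };

        /* markmask 0xff keeps only the low byte, zeroing this mark */
        printf("accepted: %d\n", check_elem(&e, 0xff) == 0);
        return 0;
}
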
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index ddb8039ec1d2..e977b5a9c48d 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port type */
@@ -29,15 +25,19 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+/* 6 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port");
/* Type specific function prefix */
#define HTYPE hash_ipport
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
/* IPv4 variant */
@@ -51,7 +51,7 @@ struct hash_ipport4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
const struct hash_ipport4_elem *ip2,
u32 *multi)
@@ -75,7 +75,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipport4_data_next(struct hash_ipport4_elem *next,
const struct hash_ipport4_elem *d)
{
@@ -95,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -108,11 +112,11 @@ static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipport4 *h = set->data;
+ struct hash_ipport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0, p = 0, port, port_to;
+ u32 ip, ip_to = 0, p = 0, port, port_to, i = 0;
bool with_ports = false;
int ret;
@@ -132,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ e.ip &= h->bitmask.ip;
+ if (e.ip == 0)
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -181,9 +189,13 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; ip <= ip_to; ip++) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.ip = htonl(ip);
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipport4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -206,7 +218,7 @@ struct hash_ipport6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
const struct hash_ipport6_elem *ip2,
u32 *multi)
@@ -230,7 +242,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipport6_data_next(struct hash_ipport6_elem *next,
const struct hash_ipport6_elem *d)
{
@@ -253,12 +265,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipport6_elem e = { .ip = { .all = { 0 } } };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
+ const struct MTYPE *h = set->data;
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
&e.port, &e.proto))
return -EINVAL;
ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -298,6 +315,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (ret)
return ret;
+ nf_inet_addr_mask_inplace(&e.ip, &h->bitmask);
+ if (ipv6_addr_any(&e.ip.in6))
+ return -EINVAL;
+
e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
if (tb[IPSET_ATTR_PROTO]) {
@@ -345,15 +366,19 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
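
How the per-port resume in the uadt hunk above composes with the outer IP loop: only the IP recorded in h->next continues from the saved port, every later IP restarts at the first port. A tiny model with made-up ranges:

#include <stdio.h>

struct next_elem { unsigned int ip; unsigned int port; };

static void walk(unsigned int ip, unsigned int ip_to,
                 unsigned int port, unsigned int port_to,
                 const struct next_elem *next, int retried)
{
        if (retried)
                ip = next->ip;          /* outer resume, as in the kernel */
        for (; ip <= ip_to; ip++) {
                unsigned int p = (retried && ip == next->ip) ? next->port
                                                             : port;

                for (; p <= port_to; p++)
                        printf("add %u,%u\n", ip, p);   /* adtfn() stand-in */
        }
}

int main(void)
{
        struct next_elem next = { .ip = 2, .port = 11 };

        walk(1, 3, 10, 12, &next, 1);   /* resumes at 2,11; 2,12; then 3,10.. */
        return 0;
}
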
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index a7f4d7a85420..39a01934b153 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,ip type */
@@ -29,10 +25,11 @@
/* 2 Counters support added */
/* 3 Comments support added */
/* 4 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */
+/* 5 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,ip");
@@ -50,7 +47,7 @@ struct hash_ipportip4_elem {
u8 padding;
};
-static inline bool
+static bool
hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
const struct hash_ipportip4_elem *ip2,
u32 *multi)
@@ -76,7 +73,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
const struct hash_ipportip4_elem *d)
{
@@ -111,11 +108,11 @@ static int
hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportip4 *h = set->data;
+ struct hash_ipportip4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportip4_elem e = { .ip = 0 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip, ip_to = 0, p = 0, port, port_to;
+ u32 ip, ip_to = 0, p = 0, port, port_to, i = 0;
bool with_ports = false;
int ret;
@@ -188,9 +185,13 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; ip <= ip_to; ip++) {
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.ip = htonl(ip);
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipportip4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -214,7 +215,7 @@ struct hash_ipportip6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
const struct hash_ipportip6_elem *ip2,
u32 *multi)
@@ -240,7 +241,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportip6_data_next(struct hash_ipportip6_elem *next,
const struct hash_ipportip6_elem *d)
{
@@ -360,11 +361,13 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportip_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 88b83d6d3084..5c6de605a9fb 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
@@ -31,10 +27,11 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,net");
@@ -63,7 +60,7 @@ struct hash_ipportnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
const struct hash_ipportnet4_elem *ip2,
u32 *multi)
@@ -75,25 +72,25 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
{
elem->ip2 &= ip_set_netmask(cidr);
@@ -120,7 +117,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
const struct hash_ipportnet4_elem *d)
{
@@ -163,12 +160,12 @@ static int
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_ipportnet4 *h = set->data;
+ struct hash_ipportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, p = 0, port, port_to;
- u32 ip2_from = 0, ip2_to = 0, ip2;
+ u32 ip2_from = 0, ip2_to = 0, ip2, i = 0;
bool with_ports = false;
u8 cidr;
int ret;
@@ -282,9 +279,15 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
for (; p <= port_to; p++) {
e.port = htons(p);
do {
+ i++;
e.ip2 = htonl(ip2);
ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr);
e.cidr = cidr - 1;
+ if (i > IPSET_MAX_RANGE) {
+ hash_ipportnet4_data_next(&h->next,
+ &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -312,7 +315,7 @@ struct hash_ipportnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
const struct hash_ipportnet6_elem *ip2,
u32 *multi)
@@ -324,25 +327,25 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip2, cidr);
@@ -369,7 +372,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next,
const struct hash_ipportnet6_elem *d)
{
@@ -517,11 +520,13 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_ipportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 4fe5f243d0a3..718814730acf 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2014 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:mac type */
@@ -20,10 +16,10 @@
#include <linux/netfilter/ipset/ip_set_hash.h>
#define IPSET_TYPE_REV_MIN 0
-#define IPSET_TYPE_REV_MAX 0
+#define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:mac");
@@ -41,7 +37,7 @@ struct hash_mac4_elem {
/* Common functions */
-static inline bool
+static bool
hash_mac4_data_equal(const struct hash_mac4_elem *e1,
const struct hash_mac4_elem *e2,
u32 *multi)
@@ -49,7 +45,7 @@ hash_mac4_data_equal(const struct hash_mac4_elem *e1,
return ether_addr_equal(e1->ether, e2->ether);
}
-static inline bool
+static bool
hash_mac4_data_list(struct sk_buff *skb, const struct hash_mac4_elem *e)
{
if (nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether))
@@ -60,7 +56,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_mac4_data_next(struct hash_mac4_elem *next,
const struct hash_mac4_elem *e)
{
@@ -129,11 +125,13 @@ static struct ip_set_type hash_mac_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_mac_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
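
Background for the PROBES to INITVAL/BUCKETSIZE swap visible in every create_policy in this series: probes belonged to the old open-addressing hash implementation, while the new attributes expose the hash seed and the per-bucket element limit. A toy model of why the seed must travel with save/restore; the mixer and all names here are invented for illustration:

#include <stdint.h>
#include <stdio.h>

static uint32_t toy_hash(uint32_t key, uint32_t initval, uint32_t htable_bits)
{
        uint32_t h = key ^ initval;     /* seeded like jhash(..., initval) */

        h *= 0x9e3779b1;                /* any decent mixer works here */
        return h >> (32 - htable_bits); /* bucket index */
}

int main(void)
{
        uint32_t key = 0x0a000001;

        /* the same key lands in different buckets under different seeds,
         * which is why a restored set needs the original initval to
         * reproduce the saved layout */
        printf("bucket(seed=1): %u\n", toy_hash(key, 1, 10));
        printf("bucket(seed=2): %u\n", toy_hash(key, 2, 10));
        return 0;
}
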
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 5449e23af13a..ce0a9ce5a91f 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net type */
@@ -28,10 +24,11 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
+/* 6 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 7 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net");
@@ -51,7 +48,7 @@ struct hash_net4_elem {
/* Common functions */
-static inline bool
+static bool
hash_net4_data_equal(const struct hash_net4_elem *ip1,
const struct hash_net4_elem *ip2,
u32 *multi)
@@ -60,25 +57,25 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_net4_do_data_match(const struct hash_net4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -101,7 +98,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_net4_data_next(struct hash_net4_elem *next,
const struct hash_net4_elem *d)
{
@@ -139,11 +136,11 @@ static int
hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_net4 *h = set->data;
+ struct hash_net4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_net4_elem e = { .cidr = HOST_MASK };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -191,10 +188,16 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
}
+
if (retried)
ip = ntohl(h->next.ip);
do {
+ i++;
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_net4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -216,7 +219,7 @@ struct hash_net6_elem {
/* Common functions */
-static inline bool
+static bool
hash_net6_data_equal(const struct hash_net6_elem *ip1,
const struct hash_net6_elem *ip2,
u32 *multi)
@@ -225,25 +228,25 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_net6_do_data_match(const struct hash_net6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -266,7 +269,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_net6_data_next(struct hash_net6_elem *next,
const struct hash_net6_elem *d)
{
@@ -358,11 +361,13 @@ static struct ip_set_type hash_net_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_net_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
@@ -372,6 +377,7 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
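
The do/while walk that the hash:net hunk above bounds relies on ip_set_range_to_cidr() to peel the largest aligned CIDR block off the front of a range. A self-contained reimplementation of that idea, illustrative rather than the kernel's exact helper:

#include <stdint.h>
#include <stdio.h>

static uint32_t range_to_cidr(uint32_t from, uint32_t to, uint8_t *cidr)
{
        uint8_t c = 32;
        uint32_t size = 1;

        /* grow the block while it stays aligned and inside the range */
        while (c > 0) {
                uint32_t bigger = size << 1;

                if (from & (bigger - 1))        /* alignment lost */
                        break;
                if (from + (bigger - 1) < from) /* 32-bit wrap */
                        break;
                if (from + (bigger - 1) > to)   /* would overshoot */
                        break;
                size = bigger;
                c--;
        }
        *cidr = c;
        return from + size;     /* first address after the block */
}

int main(void)
{
        uint32_t ip = 0x0a000001, ip_to = 0x0a00000a;   /* 10.0.0.1-10.0.0.10 */
        uint8_t cidr;

        do {
                uint32_t next = range_to_cidr(ip, ip_to, &cidr);

                printf("%u.%u.%u.%u/%u\n", ip >> 24, (ip >> 16) & 0xff,
                       (ip >> 8) & 0xff, ip & 0xff, cidr);
                ip = next;
        } while (ip <= ip_to && ip != 0);       /* /32 /31 /30 /31 /32 */
        return 0;
}
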
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f5164c1efce2..30a655e5c4fd 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,iface type */
@@ -29,10 +25,12 @@
/* 3 Counters support added */
/* 4 Comments support added */
/* 5 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */
+/* 6 skbinfo support added */
+/* 7 interface wildcard support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,iface");
@@ -42,7 +40,7 @@ MODULE_ALIAS("ip_set_hash:net,iface");
#define IP_SET_HASH_WITH_MULTI
#define IP_SET_HASH_WITH_NET0
-#define STRLCPY(a, b) strlcpy(a, b, IFNAMSIZ)
+#define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ)
/* IPv4 variant */
@@ -61,12 +59,13 @@ struct hash_netiface4_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
/* Common functions */
-static inline bool
+static bool
hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
const struct hash_netiface4_elem *ip2,
u32 *multi)
@@ -75,28 +74,30 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
-static inline int
+static int
hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -107,7 +108,8 @@ static bool
hash_netiface4_data_list(struct sk_buff *skb,
const struct hash_netiface4_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -123,7 +125,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netiface4_data_next(struct hash_netiface4_elem *next,
const struct hash_netiface4_elem *d)
{
@@ -136,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
#include "ip_set_hash_gen.h"
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static const char *get_physindev_name(const struct sk_buff *skb)
+static const char *get_physindev_name(const struct sk_buff *skb, struct net *net)
{
- struct net_device *dev = nf_bridge_get_physindev(skb);
+ struct net_device *dev = nf_bridge_get_physindev(skb, net);
return dev ? dev->name : NULL;
}
@@ -175,16 +177,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
@@ -200,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 ip = 0, ip_to = 0;
+ u32 ip = 0, ip_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -224,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -233,6 +235,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
e.ip = htonl(ip & ip_set_hostmask(e.cidr));
@@ -256,7 +260,12 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
do {
+ i++;
e.ip = htonl(ip);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netiface4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
ret = adtfn(set, &e, &ext, &ext, flags);
@@ -284,12 +293,13 @@ struct hash_netiface6_elem {
u8 cidr;
u8 nomatch;
u8 elem;
+ u8 wildcard;
char iface[IFNAMSIZ];
};
/* Common functions */
-static inline bool
+static bool
hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
const struct hash_netiface6_elem *ip2,
u32 *multi)
@@ -298,28 +308,30 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->cidr == ip2->cidr &&
(++*multi) &&
ip1->physdev == ip2->physdev &&
- strcmp(ip1->iface, ip2->iface) == 0;
+ (ip1->wildcard ?
+ strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+ strcmp(ip1->iface, ip2->iface) == 0);
}
-static inline int
+static int
hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -330,7 +342,8 @@ static bool
hash_netiface6_data_list(struct sk_buff *skb,
const struct hash_netiface6_elem *data)
{
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+ u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+ (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
if (data->nomatch)
flags |= IPSET_FLAG_NOMATCH;
@@ -346,7 +359,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netiface6_data_next(struct hash_netiface6_elem *next,
const struct hash_netiface6_elem *d)
{
@@ -382,16 +395,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
return -EINVAL;
- STRLCPY(e.iface, eiface);
+ STRSCPY(e.iface, eiface);
e.physdev = 1;
#endif
} else {
- STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+ STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
}
if (strlen(e.iface) == 0)
@@ -435,7 +448,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip, e.cidr);
- nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
+ nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ);
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -444,6 +457,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
e.physdev = 1;
if (cadt_flags & IPSET_FLAG_NOMATCH)
flags |= (IPSET_FLAG_NOMATCH << 16);
+ if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+ e.wildcard = 1;
}
ret = adtfn(set, &e, &ext, &ext, flags);
@@ -461,11 +476,13 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netiface_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
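
The data_equal changes above implement the new wildcard flag: with it set, the stored interface name is treated as a prefix pattern, so an entry "eth" matches eth0, eth1.100 and so on. A direct userspace mirror of the comparison:

#include <stdio.h>
#include <string.h>

static int iface_match(const char *stored, const char *pkt, int wildcard)
{
        return wildcard ? strncmp(stored, pkt, strlen(stored)) == 0
                        : strcmp(stored, pkt) == 0;
}

int main(void)
{
        printf("%d\n", iface_match("eth", "eth0", 1));  /* 1: prefix hit */
        printf("%d\n", iface_match("eth", "eth0", 0));  /* 0: exact miss */
        return 0;
}

From userspace this is requested with the wildcard keyword, e.g. ipset add foo 192.168.0.0/16,eth wildcard, assuming an ipset binary recent enough to know revision 7 of this type.
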
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 5a2b923bd81f..8fbe649c9dd3 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -1,9 +1,6 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* Copyright (C) 2013 Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/* Kernel module implementing an IP set type: the hash:net type */
@@ -25,7 +22,9 @@
#define IPSET_TYPE_REV_MIN 0
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+/* 3 bucketsize, initval support added */
+#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -35,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net");
/* Type specific function prefix */
#define HTYPE hash_netnet
#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_NETMASK
+#define IP_SET_HASH_WITH_BITMASK
#define IPSET_NET_COUNT 2
/* IPv4 variants */
@@ -55,7 +56,7 @@ struct hash_netnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
const struct hash_netnet4_elem *ip2,
u32 *multi)
@@ -64,32 +65,32 @@ hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
ip1->ccmp == ip2->ccmp;
}
-static inline int
+static int
hash_netnet4_do_data_match(const struct hash_netnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netnet4_data_set_flags(struct hash_netnet4_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
struct hash_netnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
{
if (inner) {
@@ -120,7 +121,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netnet4_data_next(struct hash_netnet4_elem *next,
const struct hash_netnet4_elem *d)
{
@@ -155,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]);
ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]);
- e.ip[0] &= ip_set_netmask(e.cidr[0]);
- e.ip[1] &= ip_set_netmask(e.cidr[1]);
+ e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip);
+ e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -165,12 +166,12 @@ static int
hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netnet4 *h = set->data;
+ struct hash_netnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0;
- u32 ip2 = 0, ip2_from = 0, ip2_to = 0;
+ u32 ip2 = 0, ip2_from = 0, ip2_to = 0, i = 0;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -214,8 +215,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_IP2_TO])) {
- e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0]));
- e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1]));
+ e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0]));
+ e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1]));
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_enomatch(ret, flags, adt, set) ? -ret :
ip_set_eexist(ret, flags) ? 0 : ret;
@@ -258,7 +259,12 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
e.ip[0] = htonl(ip);
ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
do {
+ i++;
e.ip[1] = htonl(ip2);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netnet4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ip2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
@@ -285,7 +291,7 @@ struct hash_netnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
const struct hash_netnet6_elem *ip2,
u32 *multi)
@@ -295,32 +301,32 @@ hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
ip1->ccmp == ip2->ccmp;
}
-static inline int
+static int
hash_netnet6_do_data_match(const struct hash_netnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netnet6_data_set_flags(struct hash_netnet6_elem *elem, u32 flags)
{
elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
-static inline void
+static void
hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
struct hash_netnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
{
if (inner) {
@@ -351,7 +357,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netnet6_data_next(struct hash_netnet6_elem *next,
const struct hash_netnet6_elem *d)
{
@@ -392,6 +398,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -EINVAL;
+
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -402,6 +413,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netnet6_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
+ const struct hash_netnet6 *h = set->data;
int ret;
if (tb[IPSET_ATTR_LINENO])
@@ -441,6 +453,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
ip6_netmask(&e.ip[0], e.cidr[0]);
ip6_netmask(&e.ip[1], e.cidr[1]);
+ nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask);
+ nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask);
+ if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6))
+ return -IPSET_ERR_HASH_ELEM;
+
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
@@ -462,14 +479,18 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -479,6 +500,7 @@ static struct ip_set_type hash_netnet_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_CIDR2] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_BYTES] = { .type = NLA_U64 },
[IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 1a187be9ebc8..d1a0628df4ef 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:net,port type */
@@ -30,10 +26,11 @@
/* 4 Counters support added */
/* 5 Comments support added */
/* 6 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */
+/* 7 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,port");
@@ -61,7 +58,7 @@ struct hash_netport4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
const struct hash_netport4_elem *ip2,
u32 *multi)
@@ -72,25 +69,25 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
{
elem->ip &= ip_set_netmask(cidr);
@@ -116,7 +113,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netport4_data_next(struct hash_netport4_elem *next,
const struct hash_netport4_elem *d)
{
@@ -157,11 +154,11 @@ static int
hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netport4 *h = set->data;
+ struct hash_netport4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
- u32 port, port_to, p = 0, ip = 0, ip_to = 0;
+ u32 port, port_to, p = 0, ip = 0, ip_to = 0, i = 0;
bool with_ports = false;
u8 cidr;
int ret;
@@ -249,8 +246,12 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
e.ip = htonl(ip);
ip = ip_set_range_to_cidr(ip, ip_to, &cidr);
e.cidr = cidr - 1;
- for (; p <= port_to; p++) {
+ for (; p <= port_to; p++, i++) {
e.port = htons(p);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netport4_data_next(&h->next, &e);
+ return -ERANGE;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -274,7 +275,7 @@ struct hash_netport6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
const struct hash_netport6_elem *ip2,
u32 *multi)
@@ -285,25 +286,25 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline int
+static int
hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr)
{
ip6_netmask(&elem->ip, cidr);
@@ -329,7 +330,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netport6_data_next(struct hash_netport6_elem *next,
const struct hash_netport6_elem *d)
{
@@ -464,11 +465,13 @@ static struct ip_set_type hash_netport_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netport_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 613e18e720a4..bf4f91b78e1d 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the hash:ip,port,net type */
@@ -27,7 +23,8 @@
#define IPSET_TYPE_REV_MIN 0
/* 0 Comments support added */
/* 1 Forceadd support added */
-#define IPSET_TYPE_REV_MAX 2 /* skbinfo support added */
+/* 2 skbinfo support added */
+#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
@@ -39,6 +36,7 @@ MODULE_ALIAS("ip_set_hash:net,port,net");
#define IP_SET_HASH_WITH_PROTO
#define IP_SET_HASH_WITH_NETS
#define IPSET_NET_COUNT 2
+#define IP_SET_HASH_WITH_NET0
/* IPv4 variant */
@@ -60,7 +58,7 @@ struct hash_netportnet4_elem {
/* Common functions */
-static inline bool
+static bool
hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
const struct hash_netportnet4_elem *ip2,
u32 *multi)
@@ -71,32 +69,32 @@ hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
struct hash_netportnet4_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
u8 cidr, bool inner)
{
@@ -130,7 +128,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
const struct hash_netportnet4_elem *d)
{
@@ -176,16 +174,26 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
+static u32
+hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr)
+{
+ if (from == 0 && to == UINT_MAX) {
+ *cidr = 0;
+ return to;
+ }
+ return ip_set_range_to_cidr(from, to, cidr);
+}
+
static int
hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct hash_netportnet4 *h = set->data;
+ struct hash_netportnet4 *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
struct hash_netportnet4_elem e = { };
struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
u32 ip = 0, ip_to = 0, p = 0, port, port_to;
- u32 ip2_from = 0, ip2_to = 0, ip2;
+ u32 ip2_from = 0, ip2_to = 0, ip2, i = 0;
bool with_ports = false;
int ret;
@@ -299,13 +307,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
do {
e.ip[0] = htonl(ip);
- ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
+ ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]);
for (; p <= port_to; p++) {
e.port = htons(p);
do {
+ i++;
e.ip[1] = htonl(ip2);
- ip2 = ip_set_range_to_cidr(ip2, ip2_to,
- &e.cidr[1]);
+ if (i > IPSET_MAX_RANGE) {
+ hash_netportnet4_data_next(&h->next,
+ &e);
+ return -ERANGE;
+ }
+ ip2 = hash_netportnet4_range_to_cidr(ip2,
+ ip2_to, &e.cidr[1]);
ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -335,7 +349,7 @@ struct hash_netportnet6_elem {
/* Common functions */
-static inline bool
+static bool
hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
const struct hash_netportnet6_elem *ip2,
u32 *multi)
@@ -347,32 +361,32 @@ hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline int
+static int
hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
-static inline void
+static void
hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags)
{
elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
-static inline void
+static void
hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
{
swap(*flags, elem->nomatch);
}
-static inline void
+static void
hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
struct hash_netportnet6_elem *orig)
{
elem->ip[1] = orig->ip[1];
}
-static inline void
+static void
hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
u8 cidr, bool inner)
{
@@ -406,7 +420,7 @@ nla_put_failure:
return true;
}
-static inline void
+static void
hash_netportnet6_data_next(struct hash_netportnet6_elem *next,
const struct hash_netportnet6_elem *d)
{
@@ -562,11 +576,13 @@ static struct ip_set_type hash_netportnet_type __read_mostly = {
.family = NFPROTO_UNSPEC,
.revision_min = IPSET_TYPE_REV_MIN,
.revision_max = IPSET_TYPE_REV_MAX,
+ .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE,
.create = hash_netportnet_create,
.create_policy = {
[IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 },
[IPSET_ATTR_MAXELEM] = { .type = NLA_U32 },
- [IPSET_ATTR_PROBES] = { .type = NLA_U8 },
+ [IPSET_ATTR_INITVAL] = { .type = NLA_U32 },
+ [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 8da228da53ae..13c7a08aa868 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,9 +1,5 @@
-/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */
/* Kernel module implementing an IP set type: the list:set type */
@@ -22,7 +18,7 @@
#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_list:set");
@@ -63,7 +59,7 @@ list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
/* Don't look up sub-counters at all */
opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
- opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+ opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE;
list_for_each_entry_rcu(e, &map->members, list) {
ret = ip_set_test(e->id, skb, par, opt);
if (ret <= 0)
@@ -83,7 +79,7 @@ list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -103,7 +99,7 @@ list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
struct set_elem *e;
int ret;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -153,7 +149,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
kfree(e);
}
-static inline void
+static void
list_set_del(struct ip_set *set, struct set_elem *e)
{
struct list_set *map = set->data;
@@ -164,7 +160,7 @@ list_set_del(struct ip_set *set, struct set_elem *e)
call_rcu(&e->rcu, __list_set_del_rcu);
}
-static inline void
+static void
list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
{
struct list_set *map = set->data;
@@ -192,9 +188,10 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct list_set *map = set->data;
struct set_adt_elem *d = value;
struct set_elem *e, *next, *prev = NULL;
- int ret;
+ int ret = 0;
- list_for_each_entry(e, &map->members, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -205,6 +202,7 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (d->before == 0) {
ret = 1;
+ goto out;
} else if (d->before > 0) {
next = list_next_entry(e, list);
ret = !list_is_last(&e->list, &map->members) &&
@@ -212,9 +210,11 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
} else {
ret = prev && prev->id == d->refid;
}
- return ret;
+ goto out;
}
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
static void
@@ -243,7 +243,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
/* Find where to add the new entry */
n = prev = next = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_rcu(e, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -292,7 +292,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (n &&
!(SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(n, set))))
- n = NULL;
+ n = NULL;
e = kzalloc(set->dsize, GFP_ATOMIC);
if (!e)
@@ -320,9 +320,9 @@ list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
{
struct list_set *map = set->data;
struct set_adt_elem *d = value;
- struct set_elem *e, *next, *prev = NULL;
+ struct set_elem *e, *n, *next, *prev = NULL;
- list_for_each_entry(e, &map->members, list) {
+ list_for_each_entry_safe(e, n, &map->members, list) {
if (SET_WITH_TIMEOUT(set) &&
ip_set_timeout_expired(ext_timeout(e, set)))
continue;
@@ -428,17 +428,8 @@ static void
list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *e, *n;
-
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
- list_for_each_entry_safe(e, n, &map->members, list) {
- list_del(&e->list);
- ip_set_put_byindex(map->net, e->id);
- ip_set_ext_destroy(set, e);
- kfree(e);
- }
+ WARN_ON_ONCE(!list_empty(&map->members));
kfree(map);
set->data = NULL;
@@ -466,7 +457,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
struct nlattr *nested;
size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
@@ -476,7 +467,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
return 0;
nla_put_failure:
@@ -494,7 +485,7 @@ list_set_list(const struct ip_set *set,
struct set_elem *e;
int ret = 0;
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ atd = nla_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
@@ -506,7 +497,7 @@ list_set_list(const struct ip_set *set,
i++;
continue;
}
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ nested = nla_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
ip_set_name_byindex(map->net, e->id, name);
@@ -514,11 +505,11 @@ list_set_list(const struct ip_set *set,
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, e, true))
goto nla_put_failure;
- ipset_nest_end(skb, nested);
+ nla_nest_end(skb, nested);
i++;
}
- ipset_nest_end(skb, atd);
+ nla_nest_end(skb, atd);
/* Set listing finished */
cb->args[IPSET_CB_ARG0] = 0;
goto out;
@@ -531,7 +522,7 @@ nla_put_failure:
ret = -EMSGSIZE;
} else {
cb->args[IPSET_CB_ARG0] = i;
- ipset_nest_end(skb, atd);
+ nla_nest_end(skb, atd);
}
out:
rcu_read_unlock();
@@ -549,6 +540,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_shutdown_sync(&map->gc);
+
+ /* Flush list to drop references to other ipsets */
+ list_set_flush(set);
+}
+
static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
@@ -562,12 +565,13 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head,
.list = list_set_list,
.same_set = list_set_same_set,
+ .cancel_gc = list_set_cancel_gc,
};
static void
list_set_gc(struct timer_list *t)
{
- struct list_set *map = from_timer(map, t, gc);
+ struct list_set *map = timer_container_of(map, t, gc);
struct ip_set *set = map->set;
spin_lock_bh(&set->lock);
@@ -607,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size)
return true;
}
+static struct lock_class_key list_set_lockdep_key;
+
static int
list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
u32 flags)
@@ -623,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (size < IP_SET_LIST_MIN_SIZE)
size = IP_SET_LIST_MIN_SIZE;
+ lockdep_set_class(&set->lock, &list_set_lockdep_key);
set->variant = &set_variant;
set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
__alignof__(struct set_elem));
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
index d5be9c25fad6..ff570bff9221 100644
--- a/net/netfilter/ipset/pfxlen.c
+++ b/net/netfilter/ipset/pfxlen.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/netfilter/ipset/pfxlen.h>
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index cad48d07c818..c203252e856d 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -1,11 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP Virtual Server configuration
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER
+ depends on INET && NETFILTER
depends on (NF_CONNTRACK || NF_CONNTRACK=n)
- ---help---
+ help
IP Virtual Server support will let you build a high-performance
virtual server based on a cluster of two or more real servers. This
option must be enabled for at least one of the clustered computers
@@ -28,24 +29,25 @@ if IP_VS
config IP_VS_IPV6
bool "IPv6 support for IPVS"
depends on IPV6 = y || IP_VS = IPV6
- select IP6_NF_IPTABLES
- ---help---
+ select NF_DEFRAG_IPV6
+ help
Add IPv6 support to IPVS.
Say Y if unsure.
config IP_VS_DEBUG
bool "IP virtual server debugging"
- ---help---
+ help
Say Y here if you want to get additional messages useful in
debugging the IP virtual server code. You can change the debug
level in /proc/sys/net/ipv4/vs/debug_level
config IP_VS_TAB_BITS
int "IPVS connection table size (the Nth power of 2)"
- range 8 20
+ range 8 20 if !64BIT
+ range 8 27 if 64BIT
default 12
- ---help---
+ help
The IPVS connection hash table uses the chaining scheme to handle
hash collisions. Using a big IPVS connection hash table will greatly
reduce conflicts when there are hundreds of thousands of connections
@@ -53,36 +55,36 @@ config IP_VS_TAB_BITS
Note the table size must be a power of 2. The table size will be the
value of 2 raised to the power of your input number. The number to choose is
- from 8 to 20, the default number is 12, which means the table size
- is 4096. Don't input the number too small, otherwise you will lose
- performance on it. You can adapt the table size yourself, according
- to your virtual server application. It is good to set the table size
- not far less than the number of connections per second multiplying
- average lasting time of connection in the table. For example, your
- virtual server gets 200 connections per second, the connection lasts
- for 200 seconds in average in the connection table, the table size
- should be not far less than 200x200, it is good to set the table
- size 32768 (2**15).
+ from 8 to 27 for 64BIT (20 otherwise); the default number is 12,
+ which means the table size is 4096. Don't choose a number that is
+ too small, otherwise you will lose performance. You can adapt the
+ table size yourself, according to your virtual server application.
+ It is good to set the table size not far less than the number of
+ connections per second multiplied by the average lifetime of a
+ connection in the table. For example, if your virtual server gets
+ 200 connections per second and a connection lasts for 200 seconds
+ on average, the table size should be not far less than 200x200;
+ it is good to set the table size to 32768 (2**15).
Note also that each connection effectively occupies 128 bytes and
each hash entry uses 8 bytes, so you can estimate how much memory is
needed for your box.
You can override this number by setting the conn_tab_bits module parameter
- or by appending ip_vs.conn_tab_bits=? to the kernel command line
- if IP VS was compiled built-in.
+ or by appending ip_vs.conn_tab_bits=? to the kernel command line if
+ IP VS was compiled built-in.
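
The sizing rule above boils down to choosing the largest power of two not far
below rate x average lifetime. A minimal userspace sketch (editor's
illustration, not part of the patch; the 200/200 figures come straight from
the help text):

	#include <stdio.h>

	int main(void)
	{
		unsigned long rate = 200;          /* new connections per second */
		unsigned long lifetime = 200;      /* average connection lifetime, seconds */
		unsigned long want = rate * lifetime;  /* ~40000 concurrent entries */
		unsigned int bits = 0;

		while ((1UL << (bits + 1)) <= want)    /* largest power of 2 <= want */
			bits++;
		printf("conn_tab_bits=%u -> table size %lu\n", bits, 1UL << bits);
		return 0;
	}

This prints conn_tab_bits=15 (table size 32768), matching the 2**15
recommendation in the help text.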
comment "IPVS transport protocol load balancing support"
config IP_VS_PROTO_TCP
bool "TCP load balancing support"
- ---help---
+ help
This option enables support for load balancing TCP transport
protocol. Say Y if unsure.
config IP_VS_PROTO_UDP
bool "UDP load balancing support"
- ---help---
+ help
This option enables support for load balancing UDP transport
protocol. Say Y if unsure.
@@ -91,20 +93,20 @@ config IP_VS_PROTO_AH_ESP
config IP_VS_PROTO_ESP
bool "ESP load balancing support"
- ---help---
+ help
This option enables support for load balancing ESP (Encapsulation
Security Payload) transport protocol. Say Y if unsure.
config IP_VS_PROTO_AH
bool "AH load balancing support"
- ---help---
+ help
This option enables support for load balancing AH (Authentication
Header) transport protocol. Say Y if unsure.
config IP_VS_PROTO_SCTP
bool "SCTP load balancing support"
- select LIBCRC32C
- ---help---
+ select NET_CRC32C
+ help
This option enables support for load balancing SCTP transport
protocol. Say Y if unsure.
@@ -112,7 +114,7 @@ comment "IPVS scheduler"
config IP_VS_RR
tristate "round-robin scheduling"
- ---help---
+ help
The round-robin scheduling algorithm simply directs network
connections to different real servers in a round-robin manner.
@@ -121,7 +123,7 @@ config IP_VS_RR
config IP_VS_WRR
tristate "weighted round-robin scheduling"
- ---help---
+ help
The weighted round-robin scheduling algorithm directs network
connections to different real servers based on server weights
in a round-robin manner. Servers with higher weights receive
@@ -133,8 +135,8 @@ config IP_VS_WRR
module, choose M here. If unsure, say N.
config IP_VS_LC
- tristate "least-connection scheduling"
- ---help---
+ tristate "least-connection scheduling"
+ help
The least-connection scheduling algorithm directs network
connections to the server with the least number of active
connections.
@@ -143,8 +145,8 @@ config IP_VS_LC
module, choose M here. If unsure, say N.
config IP_VS_WLC
- tristate "weighted least-connection scheduling"
- ---help---
+ tristate "weighted least-connection scheduling"
+ help
The weighted least-connection scheduling algorithm directs network
connections to the server with the least active connections
normalized by the server weight.
@@ -154,7 +156,7 @@ config IP_VS_WLC
config IP_VS_FO
tristate "weighted failover scheduling"
- ---help---
+ help
The weighted failover scheduling algorithm directs network
connections to the server with the highest weight that is
currently available.
@@ -164,7 +166,7 @@ config IP_VS_FO
config IP_VS_OVF
tristate "weighted overflow scheduling"
- ---help---
+ help
The weighted overflow scheduling algorithm directs network
connections to the server with the highest weight that is
currently available and overflows to the next when active
@@ -175,7 +177,7 @@ config IP_VS_OVF
config IP_VS_LBLC
tristate "locality-based least-connection scheduling"
- ---help---
+ help
The locality-based least-connection scheduling algorithm is for
destination IP load balancing. It is usually used in cache clusters.
This algorithm usually directs packets destined for an IP address to
@@ -189,7 +191,7 @@ config IP_VS_LBLC
config IP_VS_LBLCR
tristate "locality-based least-connection with replication scheduling"
- ---help---
+ help
The locality-based least-connection with replication scheduling
algorithm is also for destination IP load balancing. It is
usually used in cache cluster. It differs from the LBLC scheduling
@@ -207,7 +209,7 @@ config IP_VS_LBLCR
config IP_VS_DH
tristate "destination hashing scheduling"
- ---help---
+ help
The destination hashing scheduling algorithm assigns network
connections to the servers by looking up a statically assigned
hash table by their destination IP addresses.
@@ -217,7 +219,7 @@ config IP_VS_DH
config IP_VS_SH
tristate "source hashing scheduling"
- ---help---
+ help
The source hashing scheduling algorithm assigns network
connections to the servers by looking up a statically assigned
hash table by their source IP addresses.
@@ -227,7 +229,7 @@ config IP_VS_SH
config IP_VS_MH
tristate "maglev hashing scheduling"
- ---help---
+ help
The maglev consistent hashing scheduling algorithm provides the
Google's Maglev hashing algorithm as an IPVS scheduler. It assigns
network connections to the servers by looking up a statically
@@ -246,7 +248,7 @@ config IP_VS_MH
config IP_VS_SED
tristate "shortest expected delay scheduling"
- ---help---
+ help
The shortest expected delay scheduling algorithm assigns network
connections to the server with the shortest expected delay. The
expected delay that the job will experience is (Ci + 1) / Ui if
@@ -259,7 +261,7 @@ config IP_VS_SED
config IP_VS_NQ
tristate "never queue scheduling"
- ---help---
+ help
The never queue scheduling algorithm adopts a two-speed model.
When there is an idle server available, the job will be sent to
the idle server, instead of waiting for a fast one. When there
@@ -270,13 +272,24 @@ config IP_VS_NQ
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_TWOS
+ tristate "weighted random twos choice least-connection scheduling"
+ help
+ The weighted random twos choice least-connection scheduling
+ algorithm picks two random real servers and directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
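For readers unfamiliar with the algorithm, here is a minimal sketch of the
two-random-choices pick behind IP_VS_TWOS. It is illustrative only: the real
ip_vs_twos scheduler also performs a weighted random pre-selection and honors
overload and weight flags.

	#include <stdlib.h>

	struct rs {
		int weight;
		int active_conns;
	};

	/* Pick two random real servers, keep the one with fewer active
	 * connections per unit of weight. */
	static struct rs *twos_pick(struct rs *tab, int n)
	{
		struct rs *a = &tab[rand() % n];
		struct rs *b = &tab[rand() % n];

		if (a->weight <= 0)
			return b->weight > 0 ? b : NULL;
		if (b->weight <= 0)
			return a;

		/* Compare conns/weight without division:
		 * a->active/a->weight <= b->active/b->weight */
		if ((long long)a->active_conns * b->weight <=
		    (long long)b->active_conns * a->weight)
			return a;
		return b;
	}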
comment 'IPVS SH scheduler'
config IP_VS_SH_TAB_BITS
int "IPVS source hashing table size (the Nth power of 2)"
range 4 20
default 8
- ---help---
+ help
The source hashing scheduler maps source IPs to destinations
stored in a hash table. This table is tiled by each destination
until all slots in the table are filled. When using weights to
@@ -291,7 +304,7 @@ config IP_VS_MH_TAB_INDEX
int "IPVS maglev hashing table index of size (the prime numbers)"
range 8 17
default 12
- ---help---
+ help
The maglev hashing scheduler maps source IPs to destinations
stored in a hash table. This table is assigned by a preference
list of the positions to each destination until all slots in
@@ -306,11 +319,11 @@ config IP_VS_MH_TAB_INDEX
comment 'IPVS application helper'
config IP_VS_FTP
- tristate "FTP protocol helper"
+ tristate "FTP protocol helper"
depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
NF_CONNTRACK_FTP
select IP_VS_NFCT
- ---help---
+ help
FTP is a protocol that transfers IP address and/or port number in
the payload. In the virtual server via Network Address Translation,
the IP address and port number of real servers cannot be sent to
@@ -324,16 +337,16 @@ config IP_VS_FTP
config IP_VS_NFCT
bool "Netfilter connection tracking"
depends on NF_CONNTRACK
- ---help---
+ help
The Netfilter connection tracking support allows the IPVS
connection state to be exported to the Netfilter framework
for filtering purposes.
config IP_VS_PE_SIP
tristate "SIP persistence engine"
- depends on IP_VS_PROTO_UDP
+ depends on IP_VS_PROTO_UDP
depends on NF_CONNTRACK_SIP
- ---help---
+ help
Allow persistence based on the SIP Call-ID
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index bfce2677fda2..bb5d8125c82a 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_TWOS) += ip_vs_twos.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 7588aeaa605f..d54d7da58334 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_app.c: Application module support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Most of the code here is taken from ip_masq_app.c in kernel 2.2. The difference
* is that the ip_vs_app module handles the reverse direction (incoming requests
* and outgoing responses).
@@ -15,11 +11,9 @@
* IP_MASQ_APP application masquerading module
*
* Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -198,21 +192,29 @@ struct ip_vs_app *register_ip_vs_app(struct netns_ipvs *ipvs, struct ip_vs_app *
mutex_lock(&__ip_vs_app_mutex);
+ /* increase the module use count */
+ if (!ip_vs_use_count_inc()) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
list_for_each_entry(a, &ipvs->app_list, a_list) {
if (!strcmp(app->name, a->name)) {
err = -EEXIST;
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
goto out_unlock;
}
}
a = kmemdup(app, sizeof(*app), GFP_KERNEL);
if (!a) {
err = -ENOMEM;
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
goto out_unlock;
}
INIT_LIST_HEAD(&a->incs_list);
list_add(&a->a_list, &ipvs->app_list);
- /* increase the module use count */
- ip_vs_use_count_inc();
out_unlock:
mutex_unlock(&__ip_vs_app_mutex);
@@ -363,7 +365,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
@@ -440,7 +442,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
struct tcphdr *th;
__u32 seq;
- if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ if (skb_ensure_writable(skb, tcp_offset + sizeof(*th)))
return 0;
th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
@@ -596,13 +598,19 @@ static const struct seq_operations ip_vs_app_seq_ops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create_net("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_seq_ops,
- sizeof(struct seq_net_private));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_app", 0, ipvs->net->proc_net,
+ &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private)))
+ return -ENOMEM;
+#endif
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
unregister_ip_vs_app(ipvs, NULL /* all */);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
+#endif
}
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 5b2b17867cb1..50cc492c7553 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
@@ -9,21 +10,14 @@
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
* and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
*
* Changes:
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/interrupt.h>
#include <linux/in.h>
@@ -31,12 +25,12 @@
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/rcupdate_wait.h>
#include <net/net_namespace.h>
#include <net/ip_vs.h>
@@ -407,6 +401,8 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
unsigned int hash;
struct ip_vs_conn *cp, *ret=NULL;
+ const union nf_inet_addr *saddr;
+ __be16 sport;
/*
* Check for "full" addressed entries
@@ -416,10 +412,20 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
rcu_read_lock();
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
- if (p->vport == cp->cport && p->cport == cp->dport &&
- cp->af == p->af &&
+ if (p->vport != cp->cport)
+ continue;
+
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ sport = cp->vport;
+ saddr = &cp->vaddr;
+ } else {
+ sport = cp->dport;
+ saddr = &cp->daddr;
+ }
+
+ if (p->cport == sport && cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
- ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, saddr) &&
p->protocol == cp->protocol &&
cp->ipvs == p->ipvs) {
if (!__ip_vs_conn_get(cp))
@@ -812,9 +818,34 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head)
kmem_cache_free(ip_vs_conn_cachep, cp);
}
+/* Try to delete connection while not holding reference */
+static void ip_vs_conn_del(struct ip_vs_conn *cp)
+{
+ if (timer_delete(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ ip_vs_conn_expire(&cp->timer);
+ }
+}
+
+/* Try to delete connection while holding reference */
+static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
+{
+ if (timer_delete(&cp->timer)) {
+ /* Drop cp->control chain too */
+ if (cp->control)
+ cp->timeout = 0;
+ __ip_vs_conn_put(cp);
+ ip_vs_conn_expire(&cp->timer);
+ } else {
+ __ip_vs_conn_put(cp);
+ }
+}
+
static void ip_vs_conn_expire(struct timer_list *t)
{
- struct ip_vs_conn *cp = from_timer(cp, t, timer);
+ struct ip_vs_conn *cp = timer_container_of(cp, t, timer);
struct netns_ipvs *ipvs = cp->ipvs;
/*
@@ -828,18 +859,21 @@ static void ip_vs_conn_expire(struct timer_list *t)
struct ip_vs_conn *ct = cp->control;
/* delete the timer if it is activated by other users */
- del_timer(&cp->timer);
+ timer_delete(&cp->timer);
/* does anybody control me? */
if (ct) {
+ bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);
+
ip_vs_control_del(cp);
/* Drop CTL or non-assured TPL if not used anymore */
- if (!cp->timeout && !atomic_read(&ct->n_control) &&
+ if (has_ref && !atomic_read(&ct->n_control) &&
(!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
!(ct->state & IP_VS_CTPL_S_ASSURED))) {
IP_VS_DBG(4, "drop controlling connection\n");
- ct->timeout = 0;
- ip_vs_conn_expire_now(ct);
+ ip_vs_conn_del_put(ct);
+ } else if (has_ref) {
+ __ip_vs_conn_put(ct);
}
}
@@ -850,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
* conntrack cleanup for the net.
*/
smp_rmb();
- if (ipvs->enable)
+ if (READ_ONCE(ipvs->enable))
ip_vs_conn_drop_conntrack(cp);
}
@@ -891,7 +925,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
/* Using mod_timer_pending will ensure the timer is not
- * modified after the final del_timer in ip_vs_conn_expire.
+ * modified after the final timer_delete in ip_vs_conn_expire.
*/
if (timer_pending(&cp->timer) &&
time_after(cp->timer.expires, jiffies))
@@ -1011,28 +1045,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
#ifdef CONFIG_PROC_FS
struct ip_vs_iter_state {
struct seq_net_private p;
- struct hlist_head *l;
+ unsigned int bucket;
+ unsigned int skip_elems;
};
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+static void *ip_vs_conn_array(struct ip_vs_iter_state *iter)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_iter_state *iter = seq->private;
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) {
+ unsigned int skip = 0;
+
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
/* __ip_vs_conn_get() is not needed by
* ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
*/
- if (pos-- == 0) {
- iter->l = &ip_vs_conn_tab[idx];
+ if (skip >= iter->skip_elems) {
+ iter->bucket = idx;
return cp;
}
+
+ ++skip;
}
+
+ iter->skip_elems = 0;
cond_resched_rcu();
}
+ iter->bucket = idx;
return NULL;
}
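
The iterator above now resumes from a (bucket, skip_elems) position instead of
caching a chain pointer, so a stale pointer can never be dereferenced across
reads. A userspace sketch of the same resume pattern, with hypothetical types:

	struct node { struct node *next; };
	struct table { struct node **heads; unsigned int nbuckets; };
	struct cursor { unsigned int bucket; unsigned int skip; };

	/* Return the skip-th live node at or after cursor->bucket; the
	 * caller bumps c->skip after consuming the returned node. */
	static struct node *cursor_next(struct table *t, struct cursor *c)
	{
		unsigned int idx;

		for (idx = c->bucket; idx < t->nbuckets; idx++) {
			unsigned int seen = 0;
			struct node *n;

			for (n = t->heads[idx]; n; n = n->next) {
				if (seen >= c->skip) {
					c->bucket = idx;  /* resume point */
					return n;
				}
				seen++;
			}
			c->skip = 0;  /* new bucket: restart element count */
		}
		c->bucket = idx;
		return NULL;
	}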
@@ -1041,9 +1082,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip_vs_iter_state *iter = seq->private;
- iter->l = NULL;
rcu_read_lock();
- return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+ if (*pos == 0) {
+ iter->skip_elems = 0;
+ iter->bucket = 0;
+ return SEQ_START_TOKEN;
+ }
+
+ return ip_vs_conn_array(iter);
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -1051,28 +1097,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
struct hlist_node *e;
- struct hlist_head *l = iter->l;
- int idx;
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_conn_array(seq, 0);
+ return ip_vs_conn_array(iter);
/* more on same hash chain? */
e = rcu_dereference(hlist_next_rcu(&cp->c_list));
- if (e)
+ if (e) {
+ iter->skip_elems++;
return hlist_entry(e, struct ip_vs_conn, c_list);
-
- idx = l - ip_vs_conn_tab;
- while (++idx < ip_vs_conn_tab_size) {
- hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- iter->l = &ip_vs_conn_tab[idx];
- return cp;
- }
- cond_resched_rcu();
}
- iter->l = NULL;
- return NULL;
+
+ iter->skip_elems = 0;
+ iter->bucket++;
+
+ return ip_vs_conn_array(iter);
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
@@ -1230,8 +1270,8 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
- static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
- static char todrop_counter[9] = {0};
+ static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static signed char todrop_counter[9] = {0};
int i;
/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
@@ -1273,7 +1313,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
* Randomly scan 1/32 of the whole table every second
*/
for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
- unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+ unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
@@ -1322,8 +1362,7 @@ try_drop:
drop:
IP_VS_DBG(4, "drop connection\n");
- cp->timeout = 0;
- ip_vs_conn_expire_now(cp);
+ ip_vs_conn_del(cp);
}
cond_resched_rcu();
}
@@ -1346,19 +1385,15 @@ flush_again:
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (cp->ipvs != ipvs)
continue;
- /* As timers are expired in LIFO order, restart
- * the timer of controlling connection first, so
- * that it is expired after us.
- */
+ if (atomic_read(&cp->n_control))
+ continue;
cp_c = cp->control;
- /* cp->control is valid only with reference to cp */
- if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
IP_VS_DBG(4, "del controlling connection\n");
- ip_vs_conn_expire_now(cp_c);
- __ip_vs_conn_put(cp);
+ ip_vs_conn_del(cp_c);
}
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
}
cond_resched_rcu();
}
@@ -1371,6 +1406,45 @@ flush_again:
goto flush_again;
}
}
+
+#ifdef CONFIG_SYSCTL
+void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (cp->ipvs != ipvs)
+ continue;
+
+ dest = cp->dest;
+ if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
+ continue;
+
+ if (atomic_read(&cp->n_control))
+ continue;
+
+ cp_c = cp->control;
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_del(cp);
+ if (cp_c && !atomic_read(&cp_c->n_control)) {
+ IP_VS_DBG(4, "del controlling connection\n");
+ ip_vs_conn_del(cp_c);
+ }
+ }
+ cond_resched_rcu();
+
+ /* netns clean up started, abort delayed work */
+ if (!READ_ONCE(ipvs->enable))
+ break;
+ }
+ rcu_read_unlock();
+}
+#endif
+
/*
* per netns init and exit
*/
@@ -1378,51 +1452,78 @@ int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
atomic_set(&ipvs->conn_count, 0);
- proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
- &ip_vs_conn_seq_ops, sizeof(struct ip_vs_iter_state));
- proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
- &ip_vs_conn_sync_seq_ops,
- sizeof(struct ip_vs_iter_state));
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
+ &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn;
+
+ if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
+ &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state)))
+ goto err_conn_sync;
+#endif
+
return 0;
+
+#ifdef CONFIG_PROC_FS
+err_conn_sync:
+ remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
+err_conn:
+ return -ENOMEM;
+#endif
}
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
/* flush all the connection entries first */
ip_vs_conn_flush(ipvs);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
+#endif
}
int __init ip_vs_conn_init(void)
{
+ size_t tab_array_size;
+ int max_avail;
+#if BITS_PER_LONG > 32
+ int max = 27;
+#else
+ int max = 20;
+#endif
+ int min = 8;
int idx;
- /* Compute size and mask */
+ max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
+ max_avail -= 2; /* ~4 in hash row */
+ max_avail -= 1; /* IPVS up to 1/2 of mem */
+ max_avail -= order_base_2(sizeof(struct ip_vs_conn));
+ max = clamp(max_avail, min, max);
+ ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max);
ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
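
A worked example of the clamp above (editor's arithmetic, assuming 4 KiB pages
and a struct ip_vs_conn of roughly 256 bytes): on a 4 GiB machine,
totalram_pages() is about 1 << 20, so max_avail = 20 + 12 = 32; subtracting
2 (about four entries per hash row), 1 (IPVS limited to at most half of
memory) and 8 (order_base_2(256)) leaves 21, so conn_tab_bits is clamped to
the range [8, 21] on such a box.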
/*
* Allocate the connection hash table and initialize its list heads
*/
- ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size,
- sizeof(*ip_vs_conn_tab)));
+ tab_array_size = array_size(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab));
+ ip_vs_conn_tab = kvmalloc_array(ip_vs_conn_tab_size,
+ sizeof(*ip_vs_conn_tab), GFP_KERNEL);
if (!ip_vs_conn_tab)
return -ENOMEM;
/* Allocate ip_vs_conn slab cache */
- ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
- sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
if (!ip_vs_conn_cachep) {
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
return -ENOMEM;
}
- pr_info("Connection hash table configured "
- "(size=%d, memory=%ldKbytes)\n",
- ip_vs_conn_tab_size,
- (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
+ pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n",
+ ip_vs_conn_tab_size, tab_array_size / 1024);
IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
sizeof(struct ip_vs_conn));
@@ -1445,5 +1546,5 @@ void ip_vs_conn_cleanup(void)
rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
- vfree(ip_vs_conn_tab);
+ kvfree(ip_vs_conn_tab);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index fe9abf3cc10a..90d56f92c0f6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
@@ -9,11 +10,6 @@
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
* and others.
@@ -21,11 +17,9 @@
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
* Harald Welte don't use nfcache
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -39,6 +33,8 @@
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
+#include <net/gue.h>
+#include <net/gre.h>
#include <net/route.h>
#include <net/ip6_checksum.h>
#include <net/netns/generic.h> /* net_generic() */
@@ -53,6 +49,7 @@
#endif
#include <net/ip_vs.h>
+#include <linux/indirect_call_wrapper.h>
EXPORT_SYMBOL(register_ip_vs_scheduler);
@@ -70,6 +67,17 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
+#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
+#define SNAT_CALL(f, ...) \
+ INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
+#elif defined(CONFIG_IP_VS_PROTO_TCP)
+#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
+#elif defined(CONFIG_IP_VS_PROTO_UDP)
+#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
+#else
+#define SNAT_CALL(f, ...) f(__VA_ARGS__)
+#endif
+
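INDIRECT_CALL_n() compares the function pointer against the listed candidates
so that the common case compiles to a direct call, avoiding a retpoline thunk
on mitigated builds. A simplified sketch of the expansion (the real macros
live in include/linux/indirect_call_wrapper.h and fall back to a plain
indirect call when retpolines are disabled):

	#define EX_INDIRECT_CALL_1(f, f1, ...)				\
		((f) == (f1) ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))
	#define EX_INDIRECT_CALL_2(f, f2, f1, ...)			\
		((f) == (f2) ? f2(__VA_ARGS__) :			\
			       EX_INDIRECT_CALL_1(f, f1, __VA_ARGS__))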
static unsigned int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
@@ -123,21 +131,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.inpkts++;
- s->cnt.inbytes += skb->len;
+ u64_stats_inc(&s->cnt.inpkts);
+ u64_stats_add(&s->cnt.inbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -159,21 +167,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
s = this_cpu_ptr(dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
svc = rcu_dereference(dest->svc);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.outpkts++;
- s->cnt.outbytes += skb->len;
+ u64_stats_inc(&s->cnt.outpkts);
+ u64_stats_add(&s->cnt.outbytes, skb->len);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -191,17 +199,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
s = this_cpu_ptr(cp->dest->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
s = this_cpu_ptr(svc->stats.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
- s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
u64_stats_update_begin(&s->syncp);
- s->cnt.conns++;
+ u64_stats_inc(&s->cnt.conns);
u64_stats_update_end(&s->syncp);
local_bh_enable();
@@ -478,7 +486,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
iph->hdr_flags ^= IP_VS_HDR_INVERSE;
- cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
+ cp = INDIRECT_CALL_1(pp->conn_in_get,
+ ip_vs_conn_in_get_proto, svc->ipvs,
+ svc->af, skb, iph);
iph->hdr_flags ^= IP_VS_HDR_INVERSE;
if (cp) {
@@ -594,7 +604,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
iph->protocol == IPPROTO_UDP) ?
IP_VS_CONN_F_ONE_PACKET : 0;
- union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
+ union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
@@ -671,16 +681,10 @@ static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
return ipvs->sysctl_nat_icmp_send;
}
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
-{
- return ipvs->sysctl_expire_nodest_conn;
-}
-
#else
static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
-static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
#endif
@@ -725,12 +729,12 @@ static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
struct dst_entry *dst = skb_dst(skb);
if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
- ip6_route_me_harder(ipvs->net, skb) != 0)
+ ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0)
return 1;
} else
#endif
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
- ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
+ ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0)
return 1;
return 0;
@@ -858,7 +862,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
unsigned int verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ goto after_nat;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -871,7 +875,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
IPPROTO_SCTP == protocol)
offset += 2 * sizeof(__u16);
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto out;
#ifdef CONFIG_IP_VS_IPV6
@@ -884,6 +888,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
goto out;
+after_nat:
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
@@ -892,8 +897,6 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
-
-ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -972,7 +975,8 @@ static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(ipvs, AF_INET, skb, &ciph);
+ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
+ ipvs, AF_INET, skb, &ciph);
if (!cp)
return NF_ACCEPT;
@@ -1028,7 +1032,8 @@ static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
return NF_ACCEPT;
/* The embedded headers contain source and dest in reverse order */
- cp = pp->conn_out_get(ipvs, AF_INET6, skb, &ciph);
+ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
+ ipvs, AF_INET6, skb, &ciph);
if (!cp)
return NF_ACCEPT;
@@ -1134,7 +1139,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
__be16 vport;
unsigned int flags;
- EnterFunction(12);
vaddr = &svc->addr;
vport = svc->port;
daddr = &iph->saddr;
@@ -1202,7 +1206,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
cp->flags, refcount_read(&cp->refcnt));
- LeaveFunction(12);
return cp;
}
@@ -1257,13 +1260,17 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
{
struct ip_vs_protocol *pp = pd->pp;
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto after_nat;
+
IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");
- if (!skb_make_writable(skb, iph->len))
+ if (skb_ensure_writable(skb, iph->len))
goto drop;
/* mangle the packet */
- if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
+ if (pp->snat_handler &&
+ !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
goto drop;
#ifdef CONFIG_IP_VS_IPV6
@@ -1296,6 +1303,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");
+after_nat:
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
skb->ipvs_property = 1;
@@ -1305,13 +1313,11 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
- LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
- LeaveFunction(11);
return NF_STOLEN;
}
@@ -1319,16 +1325,17 @@ drop:
* Check if outgoing packet belongs to the established ip_vs_conn.
*/
static unsigned int
-ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ int af = state->pf;
struct sock *sk;
- EnterFunction(11);
-
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
return NF_ACCEPT;
@@ -1338,16 +1345,13 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- if (!ipvs->enable)
- return NF_ACCEPT;
-
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1389,13 +1393,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
/*
* Check if the packet belongs to an existing entry
*/
- cp = pp->conn_out_get(ipvs, af, skb, &iph);
+ cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
+ ipvs, af, skb, &iph);
- if (likely(cp)) {
- if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
- goto ignore_cp;
+ if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
- }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1454,66 +1456,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
-out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
-
-ignore_cp:
- __ip_vs_conn_put(cp);
- goto out;
-}
-
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
}
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
- * used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
- * Check if packet is reply for established ip_vs_conn.
- */
-static unsigned int
-ip_vs_local_reply6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-#endif
-
static unsigned int
ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
@@ -1536,14 +1483,12 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
"ip_vs_in: packet continues traversal as normal");
- if (iph->fragoffs) {
- /* Fragment that couldn't be mapped to a conn entry
- * is missing module nf_defrag_ipv6
- */
- IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+
+ /* Fragment couldn't be mapped to a conn entry */
+ if (iph->fragoffs)
IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
"unhandled fragment");
- }
+
*verdict = NF_ACCEPT;
return 0;
}
@@ -1551,6 +1496,77 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
return 1;
}
+/* Check the UDP tunnel and return its header length */
+static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ unsigned int offset, __u16 af,
+ const union nf_inet_addr *daddr, __u8 *proto)
+{
+ struct udphdr _udph, *udph;
+ struct ip_vs_dest *dest;
+
+ udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (!udph)
+ goto unk;
+ offset += sizeof(struct udphdr);
+ dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
+ if (!dest)
+ goto unk;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ struct guehdr _gueh, *gueh;
+
+ gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
+ if (!gueh)
+ goto unk;
+ if (gueh->control != 0 || gueh->version != 0)
+ goto unk;
+ /* Later we can support also IPPROTO_IPV6 */
+ if (gueh->proto_ctype != IPPROTO_IPIP)
+ goto unk;
+ *proto = gueh->proto_ctype;
+ return sizeof(struct udphdr) + sizeof(struct guehdr) +
+ (gueh->hlen << 2);
+ }
+
+unk:
+ return 0;
+}
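+
+/* Worked example of the return value above: a GUE header with no extension
+ * fields has hlen == 0, so the function reports sizeof(struct udphdr) +
+ * sizeof(struct guehdr) + 0 = 8 + 4 + 0 = 12 bytes of encapsulation to skip
+ * before the inner IP header; gueh->hlen counts 4-byte words, hence << 2.
+ */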
+
+/* Check the GRE tunnel and return its header length */
+static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
+ unsigned int offset, __u16 af,
+ const union nf_inet_addr *daddr, __u8 *proto)
+{
+ struct gre_base_hdr _greh, *greh;
+ struct ip_vs_dest *dest;
+
+ greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
+ if (!greh)
+ goto unk;
+ dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
+ if (!dest)
+ goto unk;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be16 type;
+
+ /* Only support version 0 and C (csum) */
+ if ((greh->flags & ~GRE_CSUM) != 0)
+ goto unk;
+ type = greh->protocol;
+ /* Later we can support also IPPROTO_IPV6 */
+ if (type != htons(ETH_P_IP))
+ goto unk;
+ *proto = IPPROTO_IPIP;
+
+ gre_flags_to_tnl_flags(flags, greh->flags);
+
+ return gre_calc_hlen(flags);
+ }
+
+unk:
+ return 0;
+}
+
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
@@ -1569,7 +1585,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip, new_cp = false;
+ bool tunnel, new_cp = false;
+ union nf_inet_addr *raddr;
+ char *outer_proto = "IPIP";
*related = 1;
@@ -1608,20 +1626,59 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
+ raddr = (union nf_inet_addr *)&cih->daddr;
- /* Special case for errors for IPIP packets */
- ipip = false;
+ /* Special case for errors for IPIP/UDP/GRE tunnel packets */
+ tunnel = false;
if (cih->protocol == IPPROTO_IPIP) {
+ struct ip_vs_dest *dest;
+
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
/* Error for our IPIP must arrive at LOCAL_IN */
if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
return NF_ACCEPT;
+ dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
+ /* Only for known tunnel */
+ if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
+ return NF_ACCEPT;
offset += cih->ihl * 4;
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ipip = true;
+ tunnel = true;
+ } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */
+ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */
+ /* Error for our tunnel must arrive at LOCAL_IN */
+ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
+ __u8 iproto;
+ int ulen;
+
+ /* Non-first fragment has no UDP/GRE header */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ offset2 = offset + cih->ihl * 4;
+ if (cih->protocol == IPPROTO_UDP) {
+ ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
+ raddr, &iproto);
+ outer_proto = "UDP";
+ } else {
+ ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
+ raddr, &iproto);
+ outer_proto = "GRE";
+ }
+ if (ulen > 0) {
+ /* Skip IP and UDP/GRE tunnel headers */
+ offset = offset2 + ulen;
+ /* Now we should be at the original IP header */
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph),
+ &_ciph);
+ if (cih && cih->version == 4 && cih->ihl >= 5 &&
+ iproto == IPPROTO_IPIP)
+ tunnel = true;
+ else
+ return NF_ACCEPT;
+ }
}
pd = ip_vs_proto_data_get(ipvs, cih->protocol);
@@ -1638,18 +1695,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
offset = ciph.len;
/* The embedded headers contain source and dest in reverse order.
- * For IPIP this is error for request, not for reply.
+ * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
*/
- cp = pp->conn_in_get(ipvs, AF_INET, skb, &ciph);
+ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
+ ipvs, AF_INET, skb, &ciph);
if (!cp) {
int v;
- if (!sysctl_schedule_icmp(ipvs))
+ if (tunnel || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
@@ -1667,7 +1725,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
goto out;
}
- if (ipip) {
+ if (tunnel) {
__be32 info = ic->un.gateway;
__u8 type = ic->type;
__u8 code = ic->code;
@@ -1679,17 +1737,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
u32 mtu = ntohs(ic->un.frag.mtu);
__be16 frag_off = cih->frag_off;
- /* Strip outer IP and ICMP, go to IPIP header */
+ /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
offset2 -= ihl + sizeof(_icmph);
skb_reset_network_header(skb);
- IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
+ outer_proto, &ip_hdr(skb)->saddr,
+ &ip_hdr(skb)->daddr, mtu);
ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
- goto ignore_ipip;
+ goto ignore_tunnel;
/* Prefer the resulting PMTU */
if (dest) {
struct ip_vs_dest_dst *dest_dst;
@@ -1702,11 +1761,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
mtu -= sizeof(struct iphdr);
info = htonl(mtu);
}
- /* Strip outer IP, ICMP and IPIP, go to IP header of
+ /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
* original request.
*/
if (pskb_pull(skb, offset2) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
skb_reset_network_header(skb);
IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1715,7 +1774,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
	/* ICMP can be shorter but anyway, account it */
ip_vs_out_stats(cp, skb);
-ignore_ipip:
+ignore_tunnel:
consume_skb(skb);
verdict = NF_STOLEN;
goto out;
@@ -1796,7 +1855,8 @@ static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
/* The embedded headers contain source and dest in reverse order
* if not from localhost
*/
- cp = pp->conn_in_get(ipvs, AF_INET6, skb, &ciph);
+ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
+ ipvs, AF_INET6, skb, &ciph);
if (!cp) {
int v;
@@ -1844,15 +1904,17 @@ out:
* and send it on its way...
*/
static unsigned int
-ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
+ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
+ struct netns_ipvs *ipvs = net_ipvs(state->net);
+ unsigned int hooknum = state->hook;
struct ip_vs_iphdr iph;
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
int ret, pkts;
- int conn_reuse_mode;
struct sock *sk;
+ int af = state->pf;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1874,7 +1936,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, false, &iph);
@@ -1884,7 +1946,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
+ if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
}
@@ -1925,18 +1987,20 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
/*
* Check if the packet belongs to an existing connection entry
*/
- cp = pp->conn_in_get(ipvs, af, skb, &iph);
+ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
+ ipvs, af, skb, &iph);
- conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
- if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
- bool uses_ct = false, resched = false;
+ if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+ int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+ bool old_ct = false, resched = false;
if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
unlikely(!atomic_read(&cp->dest->weight))) {
resched = true;
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
- } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
- uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
+ } else if (conn_reuse_mode &&
+ is_new_conn_expected(cp, conn_reuse_mode)) {
+ old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
if (!atomic_read(&cp->n_control)) {
resched = true;
} else {
@@ -1944,50 +2008,51 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
* that uses conntrack while it is still
* referenced by controlled connection(s).
*/
- resched = !uses_ct;
+ resched = !old_ct;
}
}
if (resched) {
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
if (!atomic_read(&cp->n_control))
ip_vs_conn_expire_now(cp);
__ip_vs_conn_put(cp);
- if (uses_ct)
+ if (old_ct)
return NF_DROP;
cp = NULL;
}
}
- if (unlikely(!cp)) {
- int v;
-
- if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
- return v;
- }
-
- IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
-
/* Check the server status */
- if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
/* the destination server is not available */
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
- __u32 flags = cp->flags;
+ if (!old_ct)
+ cp->flags &= ~IP_VS_CONN_F_NFCT;
- /* when timer already started, silently drop the packet.*/
- if (timer_pending(&cp->timer))
- __ip_vs_conn_put(cp);
- else
- ip_vs_conn_put(cp);
-
- if (sysctl_expire_nodest_conn(ipvs) &&
- !(flags & IP_VS_CONN_F_ONE_PACKET)) {
- /* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ if (old_ct)
+ return NF_DROP;
+ cp = NULL;
+ } else {
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
}
+ }
- return NF_DROP;
+ if (unlikely(!cp)) {
+ int v;
+
+ if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
+ return v;
}
+ IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");
+
ip_vs_in_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
if (cp->packet_xmit)
@@ -2010,7 +2075,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
pkts = sysctl_sync_threshold(ipvs);
else
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -2023,55 +2088,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
}
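The per-family wrappers removed below become redundant once the hooks read the namespace, hook number and family straight from nf_hook_state. A sketch of that shape, using only fields nf_hook_state is known to carry (demo_hook is a hypothetical name):

static unsigned int demo_hook(void *priv, struct sk_buff *skb,
			      const struct nf_hook_state *state)
{
	struct netns_ipvs *ipvs = net_ipvs(state->net);
	int af = state->pf;	/* NFPROTO_IPV4 or NFPROTO_IPV6 */

	/* the real hooks dispatch on af and state->hook from here */
	pr_debug("hook %u af %d ipvs %p\n", state->hook, af, ipvs);
	return NF_ACCEPT;
}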
/*
- * AF_INET handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-/*
- * AF_INET handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request4(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET);
-}
-
-#ifdef CONFIG_IP_VS_IPV6
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_IN chain
- * Schedule and forward packets from remote clients
- */
-static unsigned int
-ip_vs_remote_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-/*
- * AF_INET6 handler in NF_INET_LOCAL_OUT chain
- * Schedule and forward packets from local clients
- */
-static unsigned int
-ip_vs_local_request6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6);
-}
-
-#endif
-
-
-/*
* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
@@ -2084,45 +2100,36 @@ static unsigned int
ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
- int r;
struct netns_ipvs *ipvs = net_ipvs(state->net);
-
- if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
- return NF_ACCEPT;
+ int r;
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
- return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
-}
-
+ if (state->pf == NFPROTO_IPV4) {
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
#ifdef CONFIG_IP_VS_IPV6
-static unsigned int
-ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- int r;
- struct netns_ipvs *ipvs = net_ipvs(state->net);
- struct ip_vs_iphdr iphdr;
+ } else {
+ struct ip_vs_iphdr iphdr;
- ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- if (iphdr.protocol != IPPROTO_ICMPV6)
- return NF_ACCEPT;
+ ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr);
- /* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
- return NF_ACCEPT;
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
- return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
-}
+ return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr);
#endif
+ }
+ return ip_vs_in_icmp(ipvs, skb, &r, state->hook);
+}
-static const struct nf_hook_ops ip_vs_ops[] = {
+static const struct nf_hook_ops ip_vs_ops4[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 2,
@@ -2131,21 +2138,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request4,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST + 2,
@@ -2160,15 +2167,18 @@ static const struct nf_hook_ops ip_vs_ops[] = {
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply4,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
+};
+
#ifdef CONFIG_IP_VS_IPV6
+static const struct nf_hook_ops ip_vs_ops6[] = {
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 2,
@@ -2177,21 +2187,21 @@ static const struct nf_hook_ops ip_vs_ops[] = {
* or VS/NAT(change destination), so that filtering rules can be
* applied to IPVS. */
{
- .hook = ip_vs_remote_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC - 1,
},
/* Before ip_vs_in, change source only for VS/NAT */
{
- .hook = ip_vs_local_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 1,
},
/* After mangle, schedule and forward local requests */
{
- .hook = ip_vs_local_request6,
+ .hook = ip_vs_in_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST + 2,
@@ -2199,34 +2209,89 @@ static const struct nf_hook_ops ip_vs_ops[] = {
/* After packet filtering (but before ip_vs_out_icmp), catch icmp
* destined for 0.0.0.0/0, which is for incoming IPVS connections */
{
- .hook = ip_vs_forward_icmp_v6,
+ .hook = ip_vs_forward_icmp,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
/* After packet filtering, change source only for VS/NAT */
{
- .hook = ip_vs_reply6,
+ .hook = ip_vs_out_hook,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_FORWARD,
.priority = 100,
},
-#endif
};
+#endif
+
+int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+ int ret = 0;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return -EINVAL;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (!(ipvs->hooks_afmask & afmask)) {
+ ret = nf_register_net_hooks(ipvs->net, ops, count);
+ if (ret >= 0)
+ ipvs->hooks_afmask |= afmask;
+ }
+ return ret;
+}
+
+void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af)
+{
+ const struct nf_hook_ops *ops;
+ unsigned int count;
+ unsigned int afmask;
+
+ if (af == AF_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ ops = ip_vs_ops6;
+ count = ARRAY_SIZE(ip_vs_ops6);
+ afmask = 2;
+#else
+ return;
+#endif
+ } else {
+ ops = ip_vs_ops4;
+ count = ARRAY_SIZE(ip_vs_ops4);
+ afmask = 1;
+ }
+
+ if (ipvs->hooks_afmask & afmask) {
+ nf_unregister_net_hooks(ipvs->net, ops, count);
+ ipvs->hooks_afmask &= ~afmask;
+ }
+}
+
/*
* Initialize IP Virtual Server netns mem.
*/
static int __net_init __ip_vs_init(struct net *net)
{
struct netns_ipvs *ipvs;
- int ret;
ipvs = net_generic(net, ip_vs_net_id);
if (ipvs == NULL)
return -ENOMEM;
- /* Hold the beast until a service is registerd */
- ipvs->enable = 0;
+ /* Hold the beast until a service is registered */
+ WRITE_ONCE(ipvs->enable, 0);
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
@@ -2251,17 +2316,11 @@ static int __net_init __ip_vs_init(struct net *net)
if (ip_vs_sync_net_init(ipvs) < 0)
goto sync_fail;
- ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- if (ret < 0)
- goto hook_fail;
-
return 0;
/*
* Error handling
*/
-hook_fail:
- ip_vs_sync_net_cleanup(ipvs);
sync_fail:
ip_vs_conn_net_cleanup(ipvs);
conn_fail:
@@ -2277,40 +2336,48 @@ estimator_fail:
return -ENOMEM;
}
-static void __net_exit __ip_vs_cleanup(struct net *net)
+static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
- ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */
- ip_vs_conn_net_cleanup(ipvs);
- ip_vs_app_net_cleanup(ipvs);
- ip_vs_protocol_net_cleanup(ipvs);
- ip_vs_control_net_cleanup(ipvs);
- ip_vs_estimator_net_cleanup(ipvs);
- IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
- net->ipvs = NULL;
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
+ ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ ip_vs_conn_net_cleanup(ipvs);
+ ip_vs_app_net_cleanup(ipvs);
+ ip_vs_protocol_net_cleanup(ipvs);
+ ip_vs_control_net_cleanup(ipvs);
+ ip_vs_estimator_net_cleanup(ipvs);
+ IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
+ net->ipvs = NULL;
+ }
}
-static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
- EnterFunction(2);
- ipvs->enable = 0; /* Disable packet reception */
- smp_wmb();
- ip_vs_sync_net_cleanup(ipvs);
- LeaveFunction(2);
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ ip_vs_unregister_hooks(ipvs, AF_INET);
+ ip_vs_unregister_hooks(ipvs, AF_INET6);
+ WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(ipvs);
+ }
}
static struct pernet_operations ipvs_core_ops = {
.init = __ip_vs_init,
- .exit = __ip_vs_cleanup,
+ .exit_batch = __ip_vs_cleanup_batch,
.id = &ip_vs_net_id,
.size = sizeof(struct netns_ipvs),
};
static struct pernet_operations ipvs_core_dev_ops = {
- .exit = __ip_vs_dev_cleanup,
+ .exit_batch = __ip_vs_dev_cleanup_batch,
};
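A minimal sketch of the exit_batch pattern adopted here: one callback receives every netns dying in the batch via net->exit_list, so shared work (above, the single service flush under __ip_vs_mutex) runs once per batch instead of once per netns. Names are hypothetical:

static void __net_exit demo_exit_batch(struct list_head *net_list)
{
	struct net *net;

	list_for_each_entry(net, net_list, exit_list) {
		/* per-netns teardown, e.g. on net_generic() state */
	}
}

static struct pernet_operations demo_ops = {
	.exit_batch = demo_exit_batch,
};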
/*
@@ -2373,9 +2440,14 @@ static void __exit ip_vs_cleanup(void)
ip_vs_conn_cleanup();
ip_vs_protocol_cleanup();
ip_vs_control_cleanup();
+ /* common rcu_barrier() used by:
+ * - ip_vs_control_cleanup()
+ */
+ rcu_barrier();
pr_info("ipvs unloaded.\n");
}
module_init(ip_vs_init);
module_exit(ip_vs_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Virtual Server");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 432141f04af3..068702894377 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
@@ -9,17 +10,10 @@
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/init.h>
@@ -29,7 +23,6 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
-#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -43,6 +36,7 @@
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
#include <net/route.h>
#include <net/sock.h>
@@ -52,8 +46,9 @@
#include <net/ip_vs.h>
-/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
-static DEFINE_MUTEX(__ip_vs_mutex);
+MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
+
+DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */
/* sysctl variables */
@@ -97,8 +92,8 @@ static bool __ip_vs_addr_is_local_v6(struct net *net,
static void update_defense_level(struct netns_ipvs *ipvs)
{
struct sysinfo i;
- static int old_secure_tcp = 0;
int availmem;
+ int amemthresh;
int nomem;
int to_change = -1;
@@ -110,7 +105,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
/* si_swapinfo(&i); */
/* availmem = availmem - (i.totalswap - i.freeswap); */
- nomem = (availmem < ipvs->sysctl_amemthresh);
+ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
+ nomem = (availmem < amemthresh);
local_bh_disable();
@@ -150,9 +146,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 1:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
ipvs->sysctl_drop_packet = 2;
} else {
ipvs->drop_rate = 0;
@@ -160,9 +155,8 @@ static void update_defense_level(struct netns_ipvs *ipvs)
break;
case 2:
if (nomem) {
- ipvs->drop_rate = ipvs->drop_counter
- = ipvs->sysctl_amemthresh /
- (ipvs->sysctl_amemthresh-availmem);
+ ipvs->drop_counter = amemthresh / (amemthresh - availmem);
+ ipvs->drop_rate = ipvs->drop_counter;
} else {
ipvs->drop_rate = 0;
ipvs->sysctl_drop_packet = 1;
@@ -178,35 +172,35 @@ static void update_defense_level(struct netns_ipvs *ipvs)
spin_lock(&ipvs->securetcp_lock);
switch (ipvs->sysctl_secure_tcp) {
case 0:
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
break;
case 1:
if (nomem) {
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
ipvs->sysctl_secure_tcp = 2;
} else {
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
}
break;
case 2:
if (nomem) {
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
} else {
- if (old_secure_tcp >= 2)
+ if (ipvs->old_secure_tcp >= 2)
to_change = 0;
ipvs->sysctl_secure_tcp = 1;
}
break;
case 3:
- if (old_secure_tcp < 2)
+ if (ipvs->old_secure_tcp < 2)
to_change = 1;
break;
}
- old_secure_tcp = ipvs->sysctl_secure_tcp;
+ ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
if (to_change >= 0)
ip_vs_protocol_timeout_change(ipvs,
ipvs->sysctl_secure_tcp > 1);
@@ -215,6 +209,17 @@ static void update_defense_level(struct netns_ipvs *ipvs)
local_bh_enable();
}
+/* Handler for the delayed work that expires connections
+ * with no destination
+ */
+static void expire_nodest_conn_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = container_of(work, struct netns_ipvs,
+ expire_nodest_conn_work.work);
+ ip_vs_expire_nodest_conn_flush(ipvs);
+}
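The enqueue side is not part of this hunk; a sketch assuming it only needs to arm the delayed work (the helper name here is a placeholder, the real one, ip_vs_enqueue_expire_nodest_conns(), is called from __ip_vs_del_dest() further below):

static inline void demo_enqueue_expire_nodest(struct netns_ipvs *ipvs)
{
	/* queue_delayed_work() is a no-op if the work is already
	 * pending, so callers need no extra bookkeeping
	 */
	if (sysctl_expire_nodest_conn(ipvs))
		queue_delayed_work(system_long_wq,
				   &ipvs->expire_nodest_conn_work, 1);
}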
/*
* Timer for checking the defense
@@ -229,10 +234,52 @@ static void defense_work_handler(struct work_struct *work)
update_defense_level(ipvs);
if (atomic_read(&ipvs->dropentry))
ip_vs_random_dropentry(ipvs);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
}
#endif
+static void est_reload_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, est_reload_work.work);
+ int genid_done = atomic_read(&ipvs->est_genid_done);
+ unsigned long delay = HZ / 10; /* repeat startups after failure */
+ bool repeat = false;
+ int genid;
+ int id;
+
+ mutex_lock(&ipvs->est_mutex);
+ genid = atomic_read(&ipvs->est_genid);
+ for (id = 0; id < ipvs->est_kt_count; id++) {
+ struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
+
+ /* netns clean up started, abort delayed work */
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock;
+ if (!kd)
+ continue;
+		/* New config? Stop kthread tasks */
+ if (genid != genid_done)
+ ip_vs_est_kthread_stop(kd);
+ if (!kd->task && !ip_vs_est_stopped(ipvs)) {
+ /* Do not start kthreads above 0 in calc phase */
+ if ((!id || !ipvs->est_calc_phase) &&
+ ip_vs_est_kthread_start(ipvs, kd) < 0)
+ repeat = true;
+ }
+ }
+
+ atomic_set(&ipvs->est_genid_done, genid);
+
+ if (repeat)
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
+ delay);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+}
+
int
ip_vs_use_count_inc(void)
{
@@ -266,7 +313,7 @@ static inline unsigned int
ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto,
const union nf_inet_addr *addr, __be16 port)
{
- register unsigned int porth = ntohs(port);
+ unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
__u32 ahash;
@@ -428,7 +475,7 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
if (!svc && protocol == IPPROTO_TCP &&
atomic_read(&ipvs->ftpsvc_counter) &&
- (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) {
+ (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
/*
* Check if ftp service entry exists, the packet
* might belong to FTP data connections.
@@ -463,7 +510,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
static void ip_vs_service_free(struct ip_vs_service *svc)
{
- free_percpu(svc->stats.cpustats);
+ ip_vs_stats_release(&svc->stats);
kfree(svc);
}
@@ -475,17 +522,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head)
ip_vs_service_free(svc);
}
-static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
if (atomic_dec_and_test(&svc->refcnt)) {
IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
- if (do_delay)
- call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
- else
- ip_vs_service_free(svc);
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
}
@@ -497,7 +541,7 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
const union nf_inet_addr *addr,
__be16 port)
{
- register unsigned int porth = ntohs(port);
+ unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
#ifdef CONFIG_IP_VS_IPV6
@@ -514,15 +558,37 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
+ __be16 port;
if (dest->in_rs_table)
return;
+ switch (IP_VS_DFWD_METHOD(dest)) {
+ case IP_VS_CONN_F_MASQ:
+ port = dest->port;
+ break;
+ case IP_VS_CONN_F_TUNNEL:
+ switch (dest->tun_type) {
+ case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
+ port = dest->tun_port;
+ break;
+ case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
+ case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
+ port = 0;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+
/*
* Hash by proto,addr,port,
* which are the parameters of the real service.
*/
- hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
dest->in_rs_table = 1;
@@ -554,7 +620,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
/* HIT */
return true;
}
@@ -584,7 +651,37 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
- (dest->protocol == protocol || dest->vfwmark)) {
+ (dest->protocol == protocol || dest->vfwmark) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+/* Find real service record by <af,addr,tun_port>.
+ * In case of multiple records with the same <af,addr,tun_port>, only
+ * the first found record is returned.
+ *
+ * To be called under RCU lock.
+ */
+struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
+ const union nf_inet_addr *daddr,
+ __be16 tun_port)
+{
+ struct ip_vs_dest *dest;
+ unsigned int hash;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, tun_port);
+
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->tun_port == tun_port &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
/* HIT */
return dest;
}
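A usage fragment matching the ICMP path earlier in this patch; netfilter hooks already run inside an RCU read-side section, but any caller outside one must bracket the lookup itself (process_tunnel_error() is a hypothetical consumer):

rcu_read_lock();
dest = ip_vs_find_tunnel(ipvs, AF_INET, &raddr, 0);
if (dest && dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
	process_tunnel_error(dest);
rcu_read_unlock();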
@@ -719,14 +816,22 @@ out:
return dest;
}
+static void ip_vs_dest_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest *dest;
+
+ dest = container_of(head, struct ip_vs_dest, rcu_head);
+ ip_vs_stats_release(&dest->stats);
+ ip_vs_dest_put_and_free(dest);
+}
+
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
__ip_vs_dst_cache_reset(dest);
- __ip_vs_svc_put(svc, false);
- free_percpu(dest->stats.cpustats);
- ip_vs_dest_put_and_free(dest);
+ __ip_vs_svc_put(svc);
+ call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}
/*
@@ -742,7 +847,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
struct ip_vs_dest *dest, *nxt;
- del_timer_sync(&ipvs->dest_trash_timer);
+ timer_delete_sync(&ipvs->dest_trash_timer);
/* No need to use dest_trash_lock */
list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
list_del(&dest->t_list);
@@ -750,12 +855,22 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
}
}
+static void ip_vs_stats_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_stats_rcu *rs = container_of(head,
+ struct ip_vs_stats_rcu,
+ rcu_head);
+
+ ip_vs_stats_release(&rs->s);
+ kfree(rs);
+}
+
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
- spin_lock_bh(&src->lock);
+ spin_lock(&src->lock);
IP_VS_SHOW_STATS_COUNTER(conns);
IP_VS_SHOW_STATS_COUNTER(inpkts);
@@ -765,7 +880,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
ip_vs_read_estimator(dst, src);
- spin_unlock_bh(&src->lock);
+ spin_unlock(&src->lock);
}
static void
@@ -786,7 +901,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
- spin_lock_bh(&stats->lock);
+ spin_lock(&stats->lock);
/* get current counters as zero point, rates are zeroed */
@@ -800,7 +915,48 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
ip_vs_zero_estimator(stats);
- spin_unlock_bh(&stats->lock);
+ spin_unlock(&stats->lock);
+}
+
+/* Allocate fields after kzalloc */
+int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
+{
+ int i;
+
+ spin_lock_init(&s->lock);
+ s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!s->cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
+
+ u64_stats_init(&cs->syncp);
+ }
+ return 0;
+}
+
+struct ip_vs_stats *ip_vs_stats_alloc(void)
+{
+ struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s && ip_vs_stats_init_alloc(s) >= 0)
+ return s;
+ kfree(s);
+ return NULL;
+}
+
+void ip_vs_stats_release(struct ip_vs_stats *stats)
+{
+ free_percpu(stats->cpustats);
+}
+
+void ip_vs_stats_free(struct ip_vs_stats *stats)
+{
+ if (stats) {
+ ip_vs_stats_release(stats);
+ kfree(stats);
+ }
}
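Reading one CPU's counters set up by ip_vs_stats_init_alloc() uses the usual u64_stats retry loop; this fragment mirrors the percpu /proc dump converted later in this patch:

struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, cpu);
unsigned int start;
u64 conns, inbytes;

do {
	start = u64_stats_fetch_begin(&cs->syncp);
	conns = u64_stats_read(&cs->cnt.conns);
	inbytes = u64_stats_read(&cs->cnt.inbytes);
} while (u64_stats_fetch_retry(&cs->syncp, start));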
/*
@@ -830,20 +986,29 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
conn_flags |= IP_VS_CONN_F_INACTIVE;
+ /* Need to rehash? */
+ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
+ IP_VS_DFWD_METHOD(dest) ||
+ udest->tun_type != dest->tun_type ||
+ udest->tun_port != dest->tun_port)
+ ip_vs_rs_unhash(dest);
+
+ /* set the tunnel info */
+ dest->tun_type = udest->tun_type;
+ dest->tun_port = udest->tun_port;
+ dest->tun_flags = udest->tun_flags;
+
/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
conn_flags |= IP_VS_CONN_F_NOOUTPUT;
} else {
- /*
- * Put the real service in rs_table if not present.
- * For now only for NAT!
- */
- ip_vs_rs_hash(ipvs, dest);
/* FTP-NAT requires conntrack for mangling */
if (svc->port == FTPPORT)
ip_vs_register_conntrack(svc);
}
atomic_set(&dest->conn_flags, conn_flags);
+ /* Put the real service in rs_table if not present. */
+ ip_vs_rs_hash(ipvs, dest);
/* bind the service */
old_svc = rcu_dereference_protected(dest->svc, 1);
@@ -853,7 +1018,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (old_svc != svc) {
ip_vs_zero_stats(&dest->stats);
__ip_vs_bind_svc(dest, svc);
- __ip_vs_svc_put(old_svc, true);
+ __ip_vs_svc_put(old_svc);
}
}
@@ -872,7 +1037,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
spin_unlock_bh(&dest->dst_lock);
if (add) {
- ip_vs_start_estimator(svc->ipvs, &dest->stats);
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
sched = rcu_dereference_protected(svc->scheduler, 1);
@@ -890,13 +1054,11 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Create a destination for the given service
*/
static int
-ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
- struct ip_vs_dest **dest_p)
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
{
struct ip_vs_dest *dest;
- unsigned int atype, i;
-
- EnterFunction(2);
+ unsigned int atype;
+ int ret;
#ifdef CONFIG_IP_VS_IPV6
if (udest->af == AF_INET6) {
@@ -905,6 +1067,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atype & IPV6_ADDR_LINKLOCAL) &&
!__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
return -EINVAL;
+
+ ret = nf_defrag_ipv6_enable(svc->ipvs->net);
+ if (ret)
+ return ret;
} else
#endif
{
@@ -917,15 +1083,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
if (dest == NULL)
return -ENOMEM;
- dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!dest->stats.cpustats)
+ ret = ip_vs_stats_init_alloc(&dest->stats);
+ if (ret < 0)
goto err_alloc;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_dest_stats;
- ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
- u64_stats_init(&ip_vs_dest_stats->syncp);
- }
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ goto err_stats;
dest->af = udest->af;
dest->protocol = svc->protocol;
@@ -942,17 +1106,16 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
- spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
- *dest_p = dest;
-
- LeaveFunction(2);
return 0;
+err_stats:
+ ip_vs_stats_release(&dest->stats);
+
err_alloc:
kfree(dest);
- return -ENOMEM;
+ return ret;
}
@@ -967,8 +1130,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
__be16 dport = udest->port;
int ret;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -980,6 +1141,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}
+ if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
@@ -1007,15 +1175,16 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
+ ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
+ if (ret < 0)
+ return ret;
__ip_vs_update_dest(svc, dest, udest, 1);
- ret = 0;
} else {
/*
* Allocate and initialize the dest structure
*/
- ret = ip_vs_new_dest(svc, udest, &dest);
+ ret = ip_vs_new_dest(svc, udest);
}
- LeaveFunction(2);
return ret;
}
@@ -1031,8 +1200,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
union nf_inet_addr daddr;
__be16 dport = udest->port;
- EnterFunction(2);
-
if (udest->weight < 0) {
pr_err("%s(): server weight less than zero\n", __func__);
return -ERANGE;
@@ -1044,6 +1211,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return -ERANGE;
}
+ if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if (udest->tun_port == 0) {
+ pr_err("%s(): tunnel port is zero\n", __func__);
+ return -EINVAL;
+ }
+ }
+
ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
/* We use function that requires RCU lock */
@@ -1057,7 +1231,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
}
__ip_vs_update_dest(svc, dest, udest, 0);
- LeaveFunction(2);
return 0;
}
@@ -1086,6 +1259,12 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
list_add(&dest->t_list, &ipvs->dest_trash);
dest->idle_start = 0;
spin_unlock_bh(&ipvs->dest_trash_lock);
+
+	/* Queue up delayed work to expire all connections with no
+	 * destination. No-op when CONFIG_SYSCTL is disabled.
+	 */
+ if (!cleanup)
+ ip_vs_enqueue_expire_nodest_conns(ipvs);
}
@@ -1126,8 +1305,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
struct ip_vs_dest *dest;
__be16 dport = udest->port;
- EnterFunction(2);
-
/* We use function that requires RCU lock */
rcu_read_lock();
dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
@@ -1148,14 +1325,13 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
*/
__ip_vs_del_dest(svc->ipvs, dest, false);
- LeaveFunction(2);
-
return 0;
}
static void ip_vs_dest_trash_expire(struct timer_list *t)
{
- struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer);
+ struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
+ dest_trash_timer);
struct ip_vs_dest *dest, *next;
unsigned long now = jiffies;
@@ -1191,13 +1367,15 @@ static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
struct ip_vs_service **svc_p)
{
- int ret = 0, i;
+ int ret = 0;
struct ip_vs_scheduler *sched = NULL;
struct ip_vs_pe *pe = NULL;
struct ip_vs_service *svc = NULL;
+ int ret_hooks = -1;
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOPROTOOPT;
/* Lookup the scheduler by 'u->sched_name' */
if (strcmp(u->sched_name, "none")) {
@@ -1228,27 +1406,30 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ret = -EINVAL;
goto out_err;
}
+
+ ret = nf_defrag_ipv6_enable(ipvs->net);
+ if (ret)
+ goto out_err;
}
#endif
+ if ((u->af == AF_INET && !ipvs->num_services) ||
+ (u->af == AF_INET6 && !ipvs->num_services6)) {
+ ret = ip_vs_register_hooks(ipvs, u->af);
+ if (ret < 0)
+ goto out_err;
+ ret_hooks = ret;
+ }
+
svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
if (svc == NULL) {
IP_VS_DBG(1, "%s(): no memory\n", __func__);
ret = -ENOMEM;
goto out_err;
}
- svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!svc->stats.cpustats) {
- ret = -ENOMEM;
+ ret = ip_vs_stats_init_alloc(&svc->stats);
+ if (ret < 0)
goto out_err;
- }
-
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ip_vs_stats;
- ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
- u64_stats_init(&ip_vs_stats->syncp);
- }
-
/* I'm the first user of the service */
atomic_set(&svc->refcnt, 0);
@@ -1258,14 +1439,13 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
svc->port = u->port;
svc->fwmark = u->fwmark;
- svc->flags = u->flags;
+ svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
svc->ipvs = ipvs;
INIT_LIST_HEAD(&svc->destinations);
spin_lock_init(&svc->sched_lock);
- spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
if (sched) {
@@ -1275,34 +1455,47 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
sched = NULL;
}
- /* Bind the ct retriever */
- RCU_INIT_POINTER(svc->pe, pe);
- pe = NULL;
+ ret = ip_vs_start_estimator(ipvs, &svc->stats);
+ if (ret < 0)
+ goto out_err;
/* Update the virtual service counters */
if (svc->port == FTPPORT)
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
- if (svc->pe && svc->pe->conn_out)
+ if (pe && pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
- ip_vs_start_estimator(ipvs, &svc->stats);
+ /* Bind the ct retriever */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
	/* Count services per family; only IPv4 is visible to the old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services++;
+ else if (svc->af == AF_INET6)
+ ipvs->num_services6++;
/* Hash the service into the service table */
ip_vs_svc_hash(svc);
*svc_p = svc;
- /* Now there is a service - full throttle */
- ipvs->enable = 1;
+
+ if (!READ_ONCE(ipvs->enable)) {
+ /* Now there is a service - full throttle */
+ WRITE_ONCE(ipvs->enable, 1);
+
+ /* Start estimation for first time */
+ ip_vs_est_reload_start(ipvs);
+ }
+
return 0;
out_err:
+ if (ret_hooks >= 0)
+ ip_vs_unregister_hooks(ipvs, u->af);
if (svc != NULL) {
ip_vs_unbind_scheduler(svc, sched);
ip_vs_service_free(svc);
@@ -1418,9 +1611,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = svc->ipvs;
- /* Count only IPv4 services for old get/setsockopt interface */
- if (svc->af == AF_INET)
+ if (svc->af == AF_INET) {
ipvs->num_services--;
+ if (!ipvs->num_services)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ } else if (svc->af == AF_INET6) {
+ ipvs->num_services6--;
+ if (!ipvs->num_services6)
+ ip_vs_unregister_hooks(ipvs, svc->af);
+ }
ip_vs_stop_estimator(svc->ipvs, &svc->stats);
@@ -1454,7 +1653,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/*
* Free the service if nobody refers to it
*/
- __ip_vs_svc_put(svc, true);
+ __ip_vs_svc_put(svc);
/* decrease the module use count */
ip_vs_use_count_dec();
@@ -1525,16 +1724,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
/*
* Delete service by {netns} in the service table.
- * Called by __ip_vs_cleanup()
+ * Called by __ip_vs_cleanup_batch()
*/
-void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
+void ip_vs_service_nets_cleanup(struct list_head *net_list)
{
- EnterFunction(2);
+ struct netns_ipvs *ipvs;
+ struct net *net;
+
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(ipvs, true);
+ list_for_each_entry(net, net_list, exit_list) {
+ ipvs = net_ipvs(net);
+ ip_vs_flush(ipvs, true);
+ }
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
}
/* Put all references for device (dst_cache) */
@@ -1572,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
- EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
@@ -1601,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
- LeaveFunction(2);
return NOTIFY_DONE;
}
@@ -1638,30 +1839,33 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
}
}
- ip_vs_zero_stats(&ipvs->tot_stats);
+ ip_vs_zero_stats(&ipvs->tot_stats->s);
return 0;
}
#ifdef CONFIG_SYSCTL
-static int zero;
-static int three = 3;
-
static int
-proc_do_defense_mode(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_do_defense_mode(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val = *valp;
int rc;
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
- if ((*valp < 0) || (*valp > 3)) {
- /* Restore the correct value */
- *valp = val;
+ if (val < 0 || val > 3) {
+ rc = -EINVAL;
} else {
+ *valp = val;
update_defense_level(ipvs);
}
}
@@ -1669,59 +1873,198 @@ proc_do_defense_mode(struct ctl_table *table, int write,
}
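The reworked handlers above and below share one skeleton: parse into a local copy through a temporary ctl_table, fail an out-of-range write with -EINVAL instead of silently restoring the old value, and publish only on success. Reduced to essentials (demo_handler and the 0..3 range are placeholders):

static int demo_handler(const struct ctl_table *table, int write,
			void *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = table->data;
	int val = *valp;
	struct ctl_table tmp = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};
	int rc;

	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
	if (write && rc >= 0) {
		if (val < 0 || val > 3)
			rc = -EINVAL;	/* reject, do not restore */
		else
			*valp = val;	/* commit only on success */
	}
	return rc;
}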
static int
-proc_do_sync_threshold(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_do_sync_threshold(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val[2];
int rc;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = table->maxlen,
+ .mode = table->mode,
+ };
- /* backup the value first */
+ mutex_lock(&ipvs->sync_mutex);
memcpy(val, valp, sizeof(val));
-
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (valp[0] < 0 || valp[1] < 0 ||
- (valp[0] >= valp[1] && valp[1]))) {
- /* Restore the correct value */
- memcpy(valp, val, sizeof(val));
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (write) {
+ if (val[0] < 0 || val[1] < 0 ||
+ (val[0] >= val[1] && val[1]))
+ rc = -EINVAL;
+ else
+ memcpy(valp, val, sizeof(val));
}
+ mutex_unlock(&ipvs->sync_mutex);
return rc;
}
static int
-proc_do_sync_mode(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+proc_do_sync_ports(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = table->data;
int val = *valp;
int rc;
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write && (*valp != val)) {
- if ((*valp < 0) || (*valp > 1)) {
- /* Restore the correct value */
+ if (val < 1 || !is_power_of_2(val))
+ rc = -EINVAL;
+ else
*valp = val;
- }
}
return rc;
}
-static int
-proc_do_sync_ports(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
+ void *buffer)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ cpumask_var_t newmask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpulist_parse(buffer, newmask);
+ if (ret)
+ goto out;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (!ipvs->est_cpulist_valid) {
+ if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ ipvs->est_cpulist_valid = 1;
+ }
+ cpumask_and(newmask, newmask, &current->cpus_mask);
+ cpumask_copy(*valp, newmask);
+ /* est_max_threads may depend on cpulist size */
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+ ipvs->est_calc_phase = 1;
+ ip_vs_est_reload_start(ipvs);
+
+unlock:
+ mutex_unlock(&ipvs->est_mutex);
+
+out:
+ free_cpumask_var(newmask);
+ return ret;
+}
+
+static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
+ void *buffer, size_t size)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ cpumask_var_t *valp = table->data;
+ struct cpumask *mask;
+ int ret;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ if (ipvs->est_cpulist_valid)
+ mask = *valp;
+ else
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
+
+ mutex_unlock(&ipvs->est_mutex);
+
+ return ret;
+}
+
+static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+	/* Ignore both read and write (append) if *ppos is not 0 */
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ /* proc_sys_call_handler() appends terminator */
+ ret = ipvs_proc_est_cpumask_set(table, buffer);
+ if (ret >= 0)
+ *ppos += *lenp;
+ } else {
+ /* proc_sys_call_handler() allocates 1 byte for terminator */
+ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
+ if (ret >= 0) {
+ *lenp = ret;
+ *ppos += *lenp;
+ ret = 0;
+ }
+ }
+ return ret;
+}
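The handler above accepts cpulist syntax ("0-3,8" style). A small round-trip fragment with the same helpers, where cpulist_parse() returns 0 on success and the "%*pbl" format prints a mask back in list form:

cpumask_var_t mask;

if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
	return -ENOMEM;
if (!cpulist_parse("0-3,8", mask))
	pr_info("est_cpulist: %*pbl\n", cpumask_pr_args(mask));
free_cpumask_var(mask);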
+
+static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
+ struct netns_ipvs *ipvs = table->extra2;
int *valp = table->data;
int val = *valp;
- int rc;
+ int ret;
- rc = proc_dointvec(table, write, buffer, lenp, ppos);
- if (write && (*valp != val)) {
- if (*valp < 1 || !is_power_of_2(*valp)) {
- /* Restore the correct value */
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ if (val < MIN_NICE || val > MAX_NICE) {
+ ret = -EINVAL;
+ } else {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
+ *valp = val;
+ ip_vs_est_reload_start(ipvs);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+ }
+ }
+ return ret;
+}
+
+static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct netns_ipvs *ipvs = table->extra2;
+ int *valp = table->data;
+ int val = *valp;
+ int ret;
+
+ struct ctl_table tmp_table = {
+ .data = &val,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ };
+
+ ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (write && ret >= 0) {
+ mutex_lock(&ipvs->est_mutex);
+ if (*valp != val) {
*valp = val;
+ ip_vs_est_reload_start(ipvs);
}
+ mutex_unlock(&ipvs->est_mutex);
}
- return rc;
+ return ret;
}
/*
@@ -1779,7 +2122,9 @@ static struct ctl_table vs_vars[] = {
.procname = "sync_version",
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_do_sync_mode,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "sync_ports",
@@ -1853,8 +2198,8 @@ static struct ctl_table vs_vars[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &three,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "nat_icmp_send",
@@ -1892,6 +2237,24 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "run_estimation",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_run_estimation,
+ },
+ {
+ .procname = "est_cpulist",
+ .maxlen = NR_CPUS, /* unused */
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_cpulist,
+ },
+ {
+ .procname = "est_nice",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipvs_proc_est_nice,
+ },
#ifdef CONFIG_IP_VS_DEBUG
{
.procname = "debug_level",
@@ -1901,7 +2264,6 @@ static struct ctl_table vs_vars[] = {
.proc_handler = proc_dointvec,
},
#endif
- { }
};
#endif
@@ -2128,7 +2490,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
seq_puts(seq,
" Conns Packets Packets Bytes Bytes\n");
- ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
(unsigned long long)show.conns,
(unsigned long long)show.inpkts,
@@ -2152,7 +2514,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
- struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_kstats kstats;
int i;
@@ -2169,13 +2531,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
u64 conns, inpkts, outpkts, inbytes, outbytes;
do {
- start = u64_stats_fetch_begin_irq(&u->syncp);
- conns = u->cnt.conns;
- inpkts = u->cnt.inpkts;
- outpkts = u->cnt.outpkts;
- inbytes = u->cnt.inbytes;
- outbytes = u->cnt.outbytes;
- } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+ start = u64_stats_fetch_begin(&u->syncp);
+ conns = u64_stats_read(&u->cnt.conns);
+ inpkts = u64_stats_read(&u->cnt.inpkts);
+ outpkts = u64_stats_read(&u->cnt.outpkts);
+ inbytes = u64_stats_read(&u->cnt.inbytes);
+ outbytes = u64_stats_read(&u->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&u->syncp, start));
seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
i, (u64)conns, (u64)inpkts,
@@ -2221,6 +2583,18 @@ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user
u->udp_timeout);
#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
+ u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
+ return -EINVAL;
+ }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
+ return -EINVAL;
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
if (u->tcp_timeout) {
pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
@@ -2310,10 +2684,11 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
udest->u_threshold = udest_compat->u_threshold;
udest->l_threshold = udest_compat->l_threshold;
udest->af = AF_INET;
+ udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
}
static int
-do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
struct net *net = sock_net(sk);
int ret;
@@ -2337,12 +2712,9 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
return -EINVAL;
}
- if (copy_from_user(arg, user, len) != 0)
+ if (copy_from_sockptr(arg, ptr, len) != 0)
return -EFAULT;
- /* increase the module use count */
- ip_vs_use_count_inc();
-
/* Handle daemons since they have another lock */
if (cmd == IP_VS_SO_SET_STARTDAEMON ||
cmd == IP_VS_SO_SET_STOPDAEMON) {
@@ -2355,15 +2727,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = -EINVAL;
if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
sizeof(cfg.mcast_ifn)) <= 0)
- goto out_dec;
+ return ret;
cfg.syncid = dm->syncid;
ret = start_sync_thread(ipvs, &cfg, dm->state);
} else {
- mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs, dm->state);
- mutex_unlock(&ipvs->sync_mutex);
}
- goto out_dec;
+ return ret;
}
mutex_lock(&__ip_vs_mutex);
@@ -2375,6 +2745,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
/* Set timeout values for (tcp tcpfin udp) */
ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
goto out_unlock;
+ } else if (!len) {
+ /* No more commands with len == 0 below */
+ ret = -EINVAL;
+ goto out_unlock;
}
usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2453,15 +2827,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
ret = ip_vs_del_dest(svc, &udest);
break;
default:
+ WARN_ON_ONCE(1);
ret = -EINVAL;
+ break;
}
out_unlock:
mutex_unlock(&__ip_vs_mutex);
- out_dec:
- /* decrease the module use count */
- ip_vs_use_count_dec();
-
return ret;
}
@@ -2479,7 +2851,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
+ strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2673,13 +3045,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
mutex_lock(&ipvs->sync_mutex);
if (ipvs->sync_state & IP_VS_STATE_MASTER) {
d[0].state = IP_VS_STATE_MASTER;
- strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
+ strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
sizeof(d[0].mcast_ifn));
d[0].syncid = ipvs->mcfg.syncid;
}
if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
d[1].state = IP_VS_STATE_BACKUP;
- strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
+ strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
sizeof(d[1].mcast_ifn));
d[1].syncid = ipvs->bcfg.syncid;
}
@@ -2719,13 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_services *)arg;
- size = sizeof(*get) +
- sizeof(struct ip_vs_service_entry) * get->num_services;
+ size = struct_size(get, entrytable, get->num_services);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -2761,13 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_dests *)arg;
- size = sizeof(*get) +
- sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ size = struct_size(get, entrytable, get->num_dests);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -2869,12 +3239,15 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
[IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
[IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
[IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
+ [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
};
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
struct ip_vs_kstats *kstats)
{
- struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+ struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
if (!nl_stats)
return -EMSGSIZE;
@@ -2904,7 +3277,7 @@ nla_put_failure:
static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
struct ip_vs_kstats *kstats)
{
- struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+ struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
if (!nl_stats)
return -EMSGSIZE;
@@ -2950,7 +3323,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_kstats kstats;
char *sched_name;
- nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+ nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
if (!nl_service)
return -EMSGSIZE;
@@ -3065,7 +3438,7 @@ static bool ip_vs_is_af_valid(int af)
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
struct ip_vs_service_user_kern *usvc,
- struct nlattr *nla, int full_entry,
+ struct nlattr *nla, bool full_entry,
struct ip_vs_service **ret_svc)
{
struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
@@ -3074,8 +3447,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
/* Parse mandatory identifying service fields first */
if (nla == NULL ||
- nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla,
- ip_vs_svc_policy, NULL))
+ nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
return -EINVAL;
nla_af = attrs[IPVS_SVC_ATTR_AF];
@@ -3152,7 +3524,7 @@ static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
struct ip_vs_service *svc;
int ret;
- ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, 0, &svc);
+ ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
return ret ? ERR_PTR(ret) : svc;
}
@@ -3161,7 +3533,7 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
struct nlattr *nl_dest;
struct ip_vs_kstats kstats;
- nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+ nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
if (!nl_dest)
return -EMSGSIZE;
@@ -3172,6 +3544,12 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
IP_VS_CONN_F_FWD_MASK)) ||
nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
atomic_read(&dest->weight)) ||
+ nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
+ dest->tun_type) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
+ dest->tun_port) ||
+ nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
+ dest->tun_flags) ||
nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
@@ -3233,8 +3611,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
mutex_lock(&__ip_vs_mutex);
/* Try to find the service for which to dump destinations */
- if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX,
- ip_vs_cmd_policy, cb->extack))
+ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
goto out_err;
@@ -3262,7 +3639,7 @@ out_err:
}
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
- struct nlattr *nla, int full_entry)
+ struct nlattr *nla, bool full_entry)
{
struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
struct nlattr *nla_addr, *nla_port;
@@ -3270,8 +3647,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
/* Parse mandatory identifying destination fields first */
if (nla == NULL ||
- nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla,
- ip_vs_dest_policy, NULL))
+ nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
return -EINVAL;
nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
@@ -3286,20 +3662,21 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
udest->port = nla_get_be16(nla_port);
- if (nla_addr_family)
- udest->af = nla_get_u16(nla_addr_family);
- else
- udest->af = 0;
+ udest->af = nla_get_u16_default(nla_addr_family, 0);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
- *nla_l_thresh;
+ *nla_l_thresh, *nla_tun_type, *nla_tun_port,
+ *nla_tun_flags;
nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
+ nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
+ nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
+ nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS];
if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
return -EINVAL;
@@ -3309,6 +3686,15 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
udest->weight = nla_get_u32(nla_weight);
udest->u_threshold = nla_get_u32(nla_u_thresh);
udest->l_threshold = nla_get_u32(nla_l_thresh);
+
+ if (nla_tun_type)
+ udest->tun_type = nla_get_u8(nla_tun_type);
+
+ if (nla_tun_port)
+ udest->tun_port = nla_get_be16(nla_tun_port);
+
+ if (nla_tun_flags)
+ udest->tun_flags = nla_get_u16(nla_tun_flags);
}
return 0;
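
nla_get_u16_default() folds the common "attribute if present, else fallback" ternary into one call; the two forms below are equivalent:

	/* open-coded */
	udest->af = nla_addr_family ? nla_get_u16(nla_addr_family) : 0;

	/* helper: returns the default when the attribute pointer is NULL */
	udest->af = nla_get_u16_default(nla_addr_family, 0);
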
@@ -3319,7 +3705,7 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
{
struct nlattr *nl_daemon;
- nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+ nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
if (!nl_daemon)
return -EMSGSIZE;
@@ -3412,7 +3798,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
return -EINVAL;
- strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
sizeof(c.mcast_ifn));
c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
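
Unlike strlcpy(), strscpy() never reads past a bounded source and reports truncation through its return value (-E2BIG) rather than the would-be length. A sketch of the checked form, assuming the caller wants to reject oversized names:

	char ifn[IP_VS_IFNAME_MAXLEN];
	ssize_t n;

	n = strscpy(ifn, src, sizeof(ifn));	/* always NUL-terminates */
	if (n < 0)				/* -E2BIG: src did not fit */
		return -EINVAL;
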
@@ -3469,10 +3855,8 @@ static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
if (!attrs[IPVS_DAEMON_ATTR_STATE])
return -EINVAL;
- mutex_lock(&ipvs->sync_mutex);
ret = stop_sync_thread(ipvs,
nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
- mutex_unlock(&ipvs->sync_mutex);
return ret;
}
@@ -3507,9 +3891,7 @@ static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
- nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
- info->attrs[IPVS_CMD_ATTR_DAEMON],
- ip_vs_daemon_policy, info->extack))
+ nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
goto out;
if (cmd == IPVS_CMD_NEW_DAEMON)
@@ -3524,11 +3906,11 @@ out:
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
+ bool need_full_svc = false, need_full_dest = false;
struct ip_vs_service *svc = NULL;
struct ip_vs_service_user_kern usvc;
struct ip_vs_dest_user_kern udest;
int ret = 0, cmd;
- int need_full_svc = 0, need_full_dest = 0;
struct net *net = sock_net(skb->sk);
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -3552,7 +3934,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
* received a valid one. We need a full service specification when
* adding / editing a service. Only identifying members otherwise. */
if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
- need_full_svc = 1;
+ need_full_svc = true;
ret = ip_vs_genl_parse_service(ipvs, &usvc,
info->attrs[IPVS_CMD_ATTR_SERVICE],
@@ -3572,7 +3954,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
cmd == IPVS_CMD_DEL_DEST) {
if (cmd != IPVS_CMD_DEL_DEST)
- need_full_dest = 1;
+ need_full_dest = true;
ret = ip_vs_genl_parse_dest(&udest,
info->attrs[IPVS_CMD_ATTR_DEST],
@@ -3750,97 +4132,101 @@ out:
}
-static const struct genl_ops ip_vs_genl_ops[] = {
+static const struct genl_small_ops ip_vs_genl_ops[] = {
{
.cmd = IPVS_CMD_NEW_SERVICE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_SET_SERVICE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_DEL_SERVICE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_SERVICE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
.dumpit = ip_vs_genl_dump_services,
- .policy = ip_vs_cmd_policy,
},
{
.cmd = IPVS_CMD_NEW_DEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_SET_DEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_DEL_DEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_DEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.dumpit = ip_vs_genl_dump_dests,
},
{
.cmd = IPVS_CMD_NEW_DAEMON,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_DEL_DAEMON,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_daemon,
},
{
.cmd = IPVS_CMD_GET_DAEMON,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
.dumpit = ip_vs_genl_dump_daemons,
},
{
.cmd = IPVS_CMD_SET_CONFIG,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_GET_CONFIG,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
},
{
.cmd = IPVS_CMD_GET_INFO,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_get_cmd,
},
{
.cmd = IPVS_CMD_ZERO,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
- .policy = ip_vs_cmd_policy,
.doit = ip_vs_genl_set_cmd,
},
{
.cmd = IPVS_CMD_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = GENL_ADMIN_PERM,
.doit = ip_vs_genl_set_cmd,
},
@@ -3851,10 +4237,12 @@ static struct genl_family ip_vs_genl_family __ro_after_init = {
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
.maxattr = IPVS_CMD_ATTR_MAX,
+ .policy = ip_vs_cmd_policy,
.netnsok = true, /* Make ipvsadm work on netns */
.module = THIS_MODULE,
- .ops = ip_vs_genl_ops,
- .n_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .small_ops = ip_vs_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
+ .resv_start_op = IPVS_CMD_FLUSH + 1,
};
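
The ops conversion moves validation policy from per-op .policy pointers to one family-wide .policy, switches to the memory-leaner genl_small_ops, and declares resv_start_op so commands at or above it must pass strict validation. A minimal family sketch with hypothetical DEMO_* names:

	static const struct genl_small_ops demo_ops[] = {
		{
			.cmd = DEMO_CMD_GET,
			/* legacy command: keep liberal validation */
			.validate = GENL_DONT_VALIDATE_STRICT |
				    GENL_DONT_VALIDATE_DUMP,
			.doit = demo_get_doit,
		},
	};

	static struct genl_family demo_family __ro_after_init = {
		.name = "demo",
		.version = 1,
		.maxattr = DEMO_ATTR_MAX,
		.policy = demo_policy,		/* one policy for all ops */
		.module = THIS_MODULE,
		.small_ops = demo_ops,
		.n_small_ops = ARRAY_SIZE(demo_ops),
		.resv_start_op = DEMO_CMD_GET + 1,
	};
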
static int __init ip_vs_genl_register(void)
@@ -3876,22 +4264,24 @@ static void ip_vs_genl_unregister(void)
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
- int idx;
struct ctl_table *tbl;
+ int idx, ret;
+ size_t ctl_table_size = ARRAY_SIZE(vs_vars);
+ bool unpriv = net->user_ns != &init_user_ns;
atomic_set(&ipvs->dropentry, 0);
spin_lock_init(&ipvs->dropentry_lock);
spin_lock_init(&ipvs->droppacket_lock);
spin_lock_init(&ipvs->securetcp_lock);
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
+ expire_nodest_conn_handler);
+ ipvs->est_stopped = 0;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
if (tbl == NULL)
return -ENOMEM;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
} else
tbl = vs_vars;
/* Initialize sysctl defaults */
@@ -3917,10 +4307,17 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_ports = 1;
tbl[idx++].data = &ipvs->sysctl_sync_ports;
tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+
ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+
ipvs->sysctl_sync_sock_size = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+
tbl[idx++].data = &ipvs->sysctl_cache_bypass;
tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
@@ -3929,6 +4326,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx].extra2 = ipvs;
tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
@@ -3943,29 +4341,66 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
- ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
- if (ipvs->sysctl_hdr == NULL) {
- if (!net_eq(net, &init_net))
- kfree(tbl);
- return -ENOMEM;
- }
- ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
+ ipvs->sysctl_run_estimation = 1;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_run_estimation;
+
+ ipvs->est_cpulist_valid = 0;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_cpulist;
+
+ ipvs->sysctl_est_nice = IPVS_EST_NICE;
+ if (unpriv)
+ tbl[idx].mode = 0444;
+ tbl[idx].extra2 = ipvs;
+ tbl[idx++].data = &ipvs->sysctl_est_nice;
+
+#ifdef CONFIG_IP_VS_DEBUG
+ /* Global sysctls must be ro in non-init netns */
+ if (!net_eq(net, &init_net))
+ tbl[idx++].mode = 0444;
+#endif
+
+ ret = -ENOMEM;
+ ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
+ ctl_table_size);
+ if (!ipvs->sysctl_hdr)
+ goto err;
ipvs->sysctl_tbl = tbl;
+
+ ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
+ if (ret < 0)
+ goto err;
+
/* Schedule defense work */
- INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
- schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+ queue_delayed_work(system_long_wq, &ipvs->defense_work,
+ DEFENSE_TIMER_PERIOD);
return 0;
+
+err:
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return ret;
}
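
register_net_sysctl_sz() takes an explicit entry count, which is what lets the table drop its terminating empty sentinel; per-entry .mode = 0444 then replaces the old trick of hiding the whole table from unprivileged user namespaces. A sketch with a hypothetical one-knob table:

	static struct ctl_table demo_vars[] = {
		{
			.procname	= "demo_knob",
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
	};

	tbl = kmemdup(demo_vars, sizeof(demo_vars), GFP_KERNEL);
	if (!tbl)
		return -ENOMEM;
	/* Expose but make read-only in unprivileged netns */
	if (net->user_ns != &init_user_ns)
		tbl[0].mode = 0444;
	hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
				     ARRAY_SIZE(demo_vars));
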
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
struct net *net = ipvs->net;
+ cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
cancel_delayed_work_sync(&ipvs->defense_work);
cancel_work_sync(&ipvs->defense_work.work);
unregister_net_sysctl_table(ipvs->sysctl_hdr);
- ip_vs_stop_estimator(ipvs, &ipvs->tot_stats);
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+
+ if (ipvs->est_cpulist_valid)
+ free_cpumask_var(ipvs->sysctl_est_cpulist);
if (!net_eq(net, &init_net))
kfree(ipvs->sysctl_tbl);
@@ -3987,7 +4422,8 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- int i, idx;
+ int ret = -ENOMEM;
+ int idx;
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
@@ -4000,44 +4436,66 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
- /* procfs stats */
- ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
- if (!ipvs->tot_stats.cpustats)
- return -ENOMEM;
+ INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *ipvs_tot_stats;
- ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
- u64_stats_init(&ipvs_tot_stats->syncp);
- }
-
- spin_lock_init(&ipvs->tot_stats.lock);
+ /* procfs stats */
+ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL);
+ if (!ipvs->tot_stats)
+ goto out;
+ if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
+ goto err_tot_stats;
- proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops,
- sizeof(struct ip_vs_iter));
- proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
- ip_vs_stats_show, NULL);
- proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
- ip_vs_stats_percpu_show, NULL);
+#ifdef CONFIG_PROC_FS
+ if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
+ &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
+ goto err_vs;
+ if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
+ ip_vs_stats_show, NULL))
+ goto err_stats;
+ if (!proc_create_net_single("ip_vs_stats_percpu", 0,
+ ipvs->net->proc_net,
+ ip_vs_stats_percpu_show, NULL))
+ goto err_percpu;
+#endif
- if (ip_vs_control_net_init_sysctl(ipvs))
+ ret = ip_vs_control_net_init_sysctl(ipvs);
+ if (ret < 0)
goto err;
return 0;
err:
- free_percpu(ipvs->tot_stats.cpustats);
- return -ENOMEM;
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+
+err_percpu:
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+
+err_stats:
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
+
+err_vs:
+#endif
+ ip_vs_stats_release(&ipvs->tot_stats->s);
+
+err_tot_stats:
+ kfree(ipvs->tot_stats);
+
+out:
+ return ret;
}
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
+ cancel_delayed_work_sync(&ipvs->est_reload_work);
+#ifdef CONFIG_PROC_FS
remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
remove_proc_entry("ip_vs", ipvs->net->proc_net);
- free_percpu(ipvs->tot_stats.cpustats);
+#endif
+ call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
int __init ip_vs_register_nl_ioctl(void)
@@ -4074,8 +4532,6 @@ int __init ip_vs_control_init(void)
int idx;
int ret;
- EnterFunction(2);
-
/* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
@@ -4088,14 +4544,12 @@ int __init ip_vs_control_init(void)
if (ret < 0)
return ret;
- LeaveFunction(2);
return 0;
}
void ip_vs_control_cleanup(void)
{
- EnterFunction(2);
unregister_netdevice_notifier(&ip_vs_dst_notifier);
- LeaveFunction(2);
+ /* relying on common rcu_barrier() in ip_vs_cleanup() */
}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 07459e71d907..bb7aca4601ff 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Destination Hashing scheduling module
*
@@ -6,13 +7,7 @@
* Inspired by the consistent hashing scheduler patch from
* Thomas Proell <proellt@gmx.de>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
/*
@@ -35,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -275,3 +269,4 @@ static void __exit ip_vs_dh_cleanup(void)
module_init(ip_vs_dh_init);
module_exit(ip_vs_dh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs destination hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 489055091a9b..77f4f637ff67 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_est.c: simple rate estimator for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
* Network name space (netns) aware.
* Global data moved to netns i.e struct netns_ipvs
@@ -16,8 +12,7 @@
* get_stats()) do the per cpu summing.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/jiffies.h>
@@ -25,6 +20,7 @@
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
+#include <linux/rcupdate_wait.h>
#include <net/ip_vs.h>
@@ -34,9 +30,6 @@
long interval, it is easy to implement a user level daemon which
periodically reads those statistical counters and measure rate.
- Currently, the measurement is activated by slow timer handler. Hope
- this measurement will not introduce too much load.
-
We measure rate during the last 8 seconds every 2 seconds:
avgrate = avgrate*(1-W) + rate*W
@@ -51,65 +44,79 @@
to 32-bit values for conns, packets, bps, cps and pps.
* A lot of code is taken from net/core/gen_estimator.c
- */
-
-/*
- * Make a summary from each cpu
+ KEY POINTS:
+ - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled
+ - kthreads read the cpustats to update the estimators (svcs, dests, total)
+ - the states of estimators can be read (get stats) or modified (zero stats)
+ from processes
+
+ KTHREADS:
+ - estimators are added initially to est_temp_list and later kthread 0
+ distributes them to one or many kthreads for estimation
+ - kthread contexts are created and attached to array
+ - the kthread tasks are started when first service is added, before that
+ the total stats are not estimated
+ - when configuration (cpulist/nice) is changed, the tasks are restarted
+ by work (est_reload_work)
+ - kthread tasks are stopped while the cpulist is empty
+ - the kthread context holds lists with estimators (chains) which are
+ processed every 2 seconds
+ - as estimators can be added dynamically and in bursts, we try to spread
+ them to multiple chains which are estimated at different times
+ - on start, kthread 0 enters calculation phase to determine the chain limits
+ and the limit of estimators per kthread
+ - est_add_ktid: ktid where to add new ests, can point to an empty slot where
+ we should add kt data
*/
-static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
- struct ip_vs_cpu_stats __percpu *stats)
-{
- int i;
- bool add = false;
- for_each_possible_cpu(i) {
- struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
- unsigned int start;
- u64 conns, inpkts, outpkts, inbytes, outbytes;
+static struct lock_class_key __ipvs_est_key;
- if (add) {
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- conns = s->cnt.conns;
- inpkts = s->cnt.inpkts;
- outpkts = s->cnt.outpkts;
- inbytes = s->cnt.inbytes;
- outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- sum->conns += conns;
- sum->inpkts += inpkts;
- sum->outpkts += outpkts;
- sum->inbytes += inbytes;
- sum->outbytes += outbytes;
- } else {
- add = true;
- do {
- start = u64_stats_fetch_begin(&s->syncp);
- sum->conns = s->cnt.conns;
- sum->inpkts = s->cnt.inpkts;
- sum->outpkts = s->cnt.outpkts;
- sum->inbytes = s->cnt.inbytes;
- sum->outbytes = s->cnt.outbytes;
- } while (u64_stats_fetch_retry(&s->syncp, start));
- }
- }
-}
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);
-
-static void estimation_timer(struct timer_list *t)
+static void ip_vs_chain_estimation(struct hlist_head *chain)
{
struct ip_vs_estimator *e;
+ struct ip_vs_cpu_stats *c;
struct ip_vs_stats *s;
u64 rate;
- struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);
- spin_lock(&ipvs->est_lock);
- list_for_each_entry(e, &ipvs->est_list, list) {
+ hlist_for_each_entry_rcu(e, chain, list) {
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+ u64 kconns = 0, kinpkts = 0, koutpkts = 0;
+ u64 kinbytes = 0, koutbytes = 0;
+ unsigned int start;
+ int i;
+
+ if (kthread_should_stop())
+ break;
+
s = container_of(e, struct ip_vs_stats, est);
+ for_each_possible_cpu(i) {
+ c = per_cpu_ptr(s->cpustats, i);
+ do {
+ start = u64_stats_fetch_begin(&c->syncp);
+ conns = u64_stats_read(&c->cnt.conns);
+ inpkts = u64_stats_read(&c->cnt.inpkts);
+ outpkts = u64_stats_read(&c->cnt.outpkts);
+ inbytes = u64_stats_read(&c->cnt.inbytes);
+ outbytes = u64_stats_read(&c->cnt.outbytes);
+ } while (u64_stats_fetch_retry(&c->syncp, start));
+ kconns += conns;
+ kinpkts += inpkts;
+ koutpkts += outpkts;
+ kinbytes += inbytes;
+ koutbytes += outbytes;
+ }
spin_lock(&s->lock);
- ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+ s->kstats.conns = kconns;
+ s->kstats.inpkts = kinpkts;
+ s->kstats.outpkts = koutpkts;
+ s->kstats.inbytes = kinbytes;
+ s->kstats.outbytes = koutbytes;
/* scaled by 2^10, but divided by 2 seconds */
rate = (s->kstats.conns - e->last_conns) << 9;
@@ -134,28 +141,759 @@ static void estimation_timer(struct timer_list *t)
e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
spin_unlock(&s->lock);
}
- spin_unlock(&ipvs->est_lock);
- mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
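
The inlined per-cpu summing uses the standard u64_stats seqcount read loop: on 32-bit kernels a 64-bit load can tear, so the reader retries whenever the writer's sequence has moved. The canonical reader pattern:

	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct ip_vs_cpu_stats *c = per_cpu_ptr(s->cpustats, cpu);
		unsigned int start;
		u64 v;

		do {
			start = u64_stats_fetch_begin(&c->syncp);
			v = u64_stats_read(&c->cnt.inpkts);
		} while (u64_stats_fetch_retry(&c->syncp, start));
		sum += v;	/* consistent snapshot from this CPU */
	}
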
-void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
- struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ int cid;
+
+ rcu_read_lock();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ goto out;
+ for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
+ if (kthread_should_stop())
+ break;
+ ip_vs_chain_estimation(&td->chains[cid]);
+ cond_resched_rcu();
+ td = rcu_dereference(kd->ticks[row]);
+ if (!td)
+ break;
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static int ip_vs_estimation_kthread(void *data)
+{
+ struct ip_vs_est_kt_data *kd = data;
+ struct netns_ipvs *ipvs = kd->ipvs;
+ int row = kd->est_row;
+ unsigned long now;
+ int id = kd->id;
+ long gap;
+
+ if (id > 0) {
+ if (!ipvs->est_chain_max)
+ return 0;
+ } else {
+ if (!ipvs->est_chain_max) {
+ ipvs->est_calc_phase = 1;
+ /* commit est_calc_phase before reading est_genid */
+ smp_mb();
+ }
+
+ /* kthread 0 will handle the calc phase */
+ if (ipvs->est_calc_phase)
+ ip_vs_est_calc_phase(ipvs);
+ }
+
+ while (1) {
+ if (!id && !hlist_empty(&ipvs->est_temp_list))
+ ip_vs_est_drain_temp_list(ipvs);
+ set_current_state(TASK_IDLE);
+ if (kthread_should_stop())
+ break;
+
+ /* before estimation, check if we should sleep */
+ now = jiffies;
+ gap = kd->est_timer - now;
+ if (gap > 0) {
+ if (gap > IPVS_EST_TICK) {
+ kd->est_timer = now - IPVS_EST_TICK;
+ gap = IPVS_EST_TICK;
+ }
+ schedule_timeout(gap);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ if (gap < -8 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ }
+
+ if (kd->tick_len[row])
+ ip_vs_tick_estimation(kd, row);
+
+ row++;
+ if (row >= IPVS_EST_NTICKS)
+ row = 0;
+ WRITE_ONCE(kd->est_row, row);
+ kd->est_timer += IPVS_EST_TICK;
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
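
The loop above follows the usual stoppable-kthread idiom: publish the sleeping state first, then check kthread_should_stop(), so a concurrent kthread_stop() cannot be missed. A stripped-down skeleton:

	static int demo_kthread(void *data)	/* hypothetical */
	{
		while (1) {
			/* TASK_IDLE: uninterruptible sleep that does not
			 * contribute to the load average
			 */
			set_current_state(TASK_IDLE);
			if (kthread_should_stop())
				break;
			schedule_timeout(2 * HZ); /* kthread_stop() wakes us */
			/* ... periodic work ... */
		}
		__set_current_state(TASK_RUNNING);
		return 0;
	}
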
+
+/* Schedule stop/start for kthread tasks */
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
+{
+ /* Ignore reloads before first service is added */
+ if (!READ_ONCE(ipvs->enable))
+ return;
+ ip_vs_est_stopped_recalc(ipvs);
+ /* Bump the kthread configuration genid */
+ atomic_inc(&ipvs->est_genid);
+ queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
+}
+
+/* Start kthread task with current configuration */
+int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ unsigned long now;
+ int ret = 0;
+ long gap;
+
+ lockdep_assert_held(&ipvs->est_mutex);
+
+ if (kd->task)
+ goto out;
+ now = jiffies;
+ gap = kd->est_timer - now;
+ /* Sync est_timer if task is starting later */
+ if (abs(gap) > 4 * IPVS_EST_TICK)
+ kd->est_timer = now;
+ kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
+ ipvs->gen, kd->id);
+ if (IS_ERR(kd->task)) {
+ ret = PTR_ERR(kd->task);
+ kd->task = NULL;
+ goto out;
+ }
+
+ set_user_nice(kd->task, sysctl_est_nice(ipvs));
+ if (sysctl_est_preferred_cpulist(ipvs))
+ kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs));
+
+ pr_info("starting estimator thread %d...\n", kd->id);
+ wake_up_process(kd->task);
+
+out:
+ return ret;
+}
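
kthread_create() (rather than kthread_run()) leaves the task stopped, so its nice level and preferred CPUs can be configured before the first wakeup; kthread_affine_preferred() records a soft preference, not a hard bind. A sketch, with demo_kthread/nice_val/mask as stand-in names:

	struct task_struct *t;

	t = kthread_create(demo_kthread, kd, "demo:%d", id);
	if (IS_ERR(t))
		return PTR_ERR(t);
	set_user_nice(t, nice_val);		/* apply config while stopped */
	kthread_affine_preferred(t, mask);	/* preferred CPUs only */
	wake_up_process(t);			/* now let it run */
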
+
+void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
+{
+ if (kd->task) {
+ pr_info("stopping estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ kd->task = NULL;
+ }
+}
+
+/* Apply parameters to kthread */
+static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
+ struct ip_vs_est_kt_data *kd)
+{
+ kd->chain_max = ipvs->est_chain_max;
+ /* We are using single chain on RCU preemption */
+ if (IPVS_EST_TICK_CHAINS == 1)
+ kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
+ kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
+ kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
+}
+
+/* Create and start estimation kthread in a free or new array slot */
+static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ int id = ipvs->est_kt_count;
+ int ret = -ENOMEM;
+ void *arr = NULL;
+ int i;
+
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
+ return -EINVAL;
+
+ mutex_lock(&ipvs->est_mutex);
+
+ for (i = 0; i < id; i++) {
+ if (!ipvs->est_kt_arr[i])
+ break;
+ }
+ if (i >= id) {
+ arr = krealloc_array(ipvs->est_kt_arr, id + 1,
+ sizeof(struct ip_vs_est_kt_data *),
+ GFP_KERNEL);
+ if (!arr)
+ goto out;
+ ipvs->est_kt_arr = arr;
+ } else {
+ id = i;
+ }
+
+ kd = kzalloc(sizeof(*kd), GFP_KERNEL);
+ if (!kd)
+ goto out;
+ kd->ipvs = ipvs;
+ bitmap_fill(kd->avail, IPVS_EST_NTICKS);
+ kd->est_timer = jiffies;
+ kd->id = id;
+ ip_vs_est_set_params(ipvs, kd);
+
+ /* Pre-allocate stats used in calc phase */
+ if (!id && !kd->calc_stats) {
+ kd->calc_stats = ip_vs_stats_alloc();
+ if (!kd->calc_stats)
+ goto out;
+ }
+
+ /* Start kthread tasks only when services are present */
+ if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
+ ret = ip_vs_est_kthread_start(ipvs, kd);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (arr)
+ ipvs->est_kt_count++;
+ ipvs->est_kt_arr[id] = kd;
+ kd = NULL;
+ /* Use most recent kthread for new ests */
+ ipvs->est_add_ktid = id;
+ ret = 0;
+
+out:
+ mutex_unlock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+
+ return ret;
+}
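
krealloc_array() is the overflow-checked way to grow the slot array: it verifies the n * size multiplication before delegating to krealloc(), and on failure the original allocation is left intact. The grow-by-one pattern, in isolation:

	struct demo **new_arr;	/* hypothetical element type */

	new_arr = krealloc_array(arr, n + 1, sizeof(*arr), GFP_KERNEL);
	if (!new_arr)
		return -ENOMEM;	/* old arr still valid and owned by us */
	arr = new_arr;
	arr[n++] = elem;
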
+
+/* Select ktid where to add new ests: available, unused or new slot */
+static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
+{
+ int ktid, best = ipvs->est_kt_count;
+ struct ip_vs_est_kt_data *kd;
+
+ for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
+ kd = ipvs->est_kt_arr[ktid];
+ if (kd) {
+ if (kd->est_count < kd->est_max_count) {
+ best = ktid;
+ break;
+ }
+ } else if (ktid < best) {
+ best = ktid;
+ }
+ }
+ ipvs->est_add_ktid = best;
+}
- INIT_LIST_HEAD(&est->list);
+/* Add estimator to current kthread (est_add_ktid) */
+static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
+ struct ip_vs_estimator *est)
+{
+ struct ip_vs_est_kt_data *kd = NULL;
+ struct ip_vs_est_tick_data *td;
+ int ktid, row, crow, cid, ret;
+ int delay = est->ktrow;
+
+ BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
+ "Too many chains for ktcid");
+
+ if (ipvs->est_add_ktid < ipvs->est_kt_count) {
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+ if (kd)
+ goto add_est;
+ }
- spin_lock_bh(&ipvs->est_lock);
- list_add(&est->list, &ipvs->est_list);
- spin_unlock_bh(&ipvs->est_lock);
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret < 0)
+ goto out;
+ kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
+
+add_est:
+ ktid = kd->id;
+ /* For a small number of estimators prefer to use few ticks,
+ * otherwise try to add into the last estimated row.
+ * est_row and add_row point after the row we should use
+ */
+ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
+ crow = READ_ONCE(kd->est_row);
+ else
+ crow = kd->add_row;
+ crow += delay;
+ if (crow >= IPVS_EST_NTICKS)
+ crow -= IPVS_EST_NTICKS;
+ /* Assume initial delay ? */
+ if (delay >= IPVS_EST_NTICKS - 1) {
+ /* Preserve initial delay or decrease it if no space in tick */
+ row = crow;
+ if (crow < IPVS_EST_NTICKS - 1) {
+ crow++;
+ row = find_last_bit(kd->avail, crow);
+ }
+ if (row >= crow)
+ row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
+ } else {
+ /* Preserve delay or increase it if no space in tick */
+ row = IPVS_EST_NTICKS;
+ if (crow > 0)
+ row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
+ if (row >= IPVS_EST_NTICKS)
+ row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
+ }
+
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td) {
+ td = kzalloc(sizeof(*td), GFP_KERNEL);
+ if (!td) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ rcu_assign_pointer(kd->ticks[row], td);
+ }
+
+ cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);
+
+ kd->est_count++;
+ kd->tick_len[row]++;
+ if (!td->chain_len[cid])
+ __set_bit(cid, td->present);
+ td->chain_len[cid]++;
+ est->ktid = ktid;
+ est->ktrow = row;
+ est->ktcid = cid;
+ hlist_add_head_rcu(&est->list, &td->chains[cid]);
+
+ if (td->chain_len[cid] >= kd->chain_max) {
+ __set_bit(cid, td->full);
+ if (kd->tick_len[row] >= kd->tick_max)
+ __clear_bit(row, kd->avail);
+ }
+
+ /* Update est_add_ktid to point to first available/empty kt slot */
+ if (kd->est_count == kd->est_max_count)
+ ip_vs_est_update_ktid(ipvs);
+
+ ret = 0;
+
+out:
+ return ret;
}
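
The tick-row selection leans on the generic bitmap API: kd->avail keeps a bit set for every row that still has room, so placement reduces to a couple of find_*_bit() scans with wrap-around. The core moves, in isolation:

	DECLARE_BITMAP(avail, IPVS_EST_NTICKS);
	int row;

	bitmap_fill(avail, IPVS_EST_NTICKS);		/* all rows usable */
	row = find_next_bit(avail, IPVS_EST_NTICKS, crow); /* first >= crow */
	if (row >= IPVS_EST_NTICKS)			/* none above: wrap */
		row = find_first_bit(avail, IPVS_EST_NTICKS);
	__clear_bit(row, avail);			/* row became full */
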
+/* Start estimation for stats */
+int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ int ret;
+
+ if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
+ est->ktid = -1;
+ est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
+
+ /* We prefer this code to be short, kthread 0 will requeue the
+ * estimator to an available chain. If tasks are disabled, we
+ * will not allocate much memory, just for kt 0.
+ */
+ ret = 0;
+ if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
+ ret = ip_vs_est_add_kthread(ipvs);
+ if (ret >= 0)
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ else
+ INIT_HLIST_NODE(&est->list);
+ return ret;
+}
+
+static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
+{
+ if (kd) {
+ if (kd->task) {
+ pr_info("stop unused estimator thread %d...\n", kd->id);
+ kthread_stop(kd->task);
+ }
+ ip_vs_stats_free(kd->calc_stats);
+ kfree(kd);
+ }
+}
+
+/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ int ktid = est->ktid;
+ int row = est->ktrow;
+ int cid = est->ktcid;
+
+ /* Failed to add to chain ? */
+ if (hlist_unhashed(&est->list))
+ return;
+
+ /* On return, estimator can be freed, dequeue it now */
+
+ /* In est_temp_list ? */
+ if (ktid < 0) {
+ hlist_del(&est->list);
+ goto end_kt0;
+ }
- spin_lock_bh(&ipvs->est_lock);
- list_del(&est->list);
- spin_unlock_bh(&ipvs->est_lock);
+ hlist_del_rcu(&est->list);
+ kd = ipvs->est_kt_arr[ktid];
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ __clear_bit(cid, td->full);
+ td->chain_len[cid]--;
+ if (!td->chain_len[cid])
+ __clear_bit(cid, td->present);
+ kd->tick_len[row]--;
+ __set_bit(row, kd->avail);
+ if (!kd->tick_len[row]) {
+ RCU_INIT_POINTER(kd->ticks[row], NULL);
+ kfree_rcu(td, rcu_head);
+ }
+ kd->est_count--;
+ if (kd->est_count) {
+ /* This kt slot may have just become available, prefer it */
+ if (ktid < ipvs->est_add_ktid)
+ ipvs->est_add_ktid = ktid;
+ return;
+ }
+
+ if (ktid > 0) {
+ mutex_lock(&ipvs->est_mutex);
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[ktid] = NULL;
+ if (ktid == ipvs->est_kt_count - 1) {
+ ipvs->est_kt_count--;
+ while (ipvs->est_kt_count > 1 &&
+ !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
+ ipvs->est_kt_count--;
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* This slot is now empty, prefer another available kt slot */
+ if (ktid == ipvs->est_add_ktid)
+ ip_vs_est_update_ktid(ipvs);
+ }
+
+end_kt0:
+ /* kt 0 is freed after all other kthreads and chains are empty */
+ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
+ kd = ipvs->est_kt_arr[0];
+ if (!kd || !kd->est_count) {
+ mutex_lock(&ipvs->est_mutex);
+ if (kd) {
+ ip_vs_est_kthread_destroy(kd);
+ ipvs->est_kt_arr[0] = NULL;
+ }
+ ipvs->est_kt_count--;
+ mutex_unlock(&ipvs->est_mutex);
+ ipvs->est_add_ktid = 0;
+ }
+ }
+}
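
Dequeueing splits into two cases because the kthread readers are lockless: entries still in est_temp_list are only ever touched under the mutex (plain hlist_del()), while entries in kthread chains are traversed under RCU, so unlinking uses hlist_del_rcu() and the tick data is reclaimed only after a grace period:

	hlist_del_rcu(&est->list);		/* readers may still see it */
	if (!kd->tick_len[row]) {
		RCU_INIT_POINTER(kd->ticks[row], NULL);
		kfree_rcu(td, rcu_head);	/* free after grace period */
	}
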
+
+/* Register all ests from est_temp_list to kthreads */
+static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_estimator *est;
+
+ while (1) {
+ int max = 16;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ while (max-- > 0) {
+ est = hlist_entry_safe(ipvs->est_temp_list.first,
+ struct ip_vs_estimator, list);
+ if (est) {
+ if (kthread_should_stop())
+ goto unlock;
+ hlist_del_init(&est->list);
+ if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
+ continue;
+ est->ktid = -1;
+ hlist_add_head(&est->list,
+ &ipvs->est_temp_list);
+ /* Abort, some entries will not be estimated
+ * until next attempt
+ */
+ }
+ goto unlock;
+ }
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ }
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
+}
+
+/* Calculate limits for all kthreads */
+static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
+{
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ struct ip_vs_est_kt_data *kd;
+ struct hlist_head chain;
+ struct ip_vs_stats *s;
+ int cache_factor = 4;
+ int i, loops, ntest;
+ s32 min_est = 0;
+ ktime_t t1, t2;
+ int max = 8;
+ int ret = 1;
+ s64 diff;
+ u64 val;
+
+ INIT_HLIST_HEAD(&chain);
+ mutex_lock(&__ip_vs_mutex);
+ kd = ipvs->est_kt_arr[0];
+ mutex_unlock(&__ip_vs_mutex);
+ s = kd ? kd->calc_stats : NULL;
+ if (!s)
+ goto out;
+ hlist_add_head(&s->est.list, &chain);
+
+ loops = 1;
+ /* Get best result from many tests */
+ for (ntest = 0; ntest < 12; ntest++) {
+ if (!(ntest & 3)) {
+ /* Wait for cpufreq frequency transition */
+ wait_event_idle_timeout(wq, kthread_should_stop(),
+ HZ / 50);
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto stop;
+ }
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ /* Put stats in cache */
+ ip_vs_chain_estimation(&chain);
+
+ t1 = ktime_get();
+ for (i = loops * cache_factor; i > 0; i--)
+ ip_vs_chain_estimation(&chain);
+ t2 = ktime_get();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto stop;
+ cond_resched();
+
+ diff = ktime_to_ns(ktime_sub(t2, t1));
+ if (diff <= 1 * NSEC_PER_USEC) {
+ /* Do more loops on low time resolution */
+ loops *= 2;
+ continue;
+ }
+ if (diff >= NSEC_PER_SEC)
+ continue;
+ val = diff;
+ do_div(val, loops);
+ if (!min_est || val < min_est) {
+ min_est = val;
+ /* goal: 95usec per chain */
+ val = 95 * NSEC_PER_USEC;
+ if (val >= min_est) {
+ do_div(val, min_est);
+ max = (int)val;
+ } else {
+ max = 1;
+ }
+ }
+ }
+
+out:
+ if (s)
+ hlist_del_init(&s->est.list);
+ *chain_max = max;
+ return ret;
+
+stop:
+ ret = 0;
+ goto out;
+}
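
The calibration is a small in-kernel micro-benchmark: time a burst of estimation passes with ktime_get() under disabled BHs, keep the fastest run, and derive how many estimators fit the 95 usec/chain budget. The timing core, reduced to a sketch (demo_work() is a stand-in for the measured workload):

	ktime_t t1, t2;
	u64 per_loop;

	local_bh_disable();
	t1 = ktime_get();
	for (i = 0; i < loops; i++)
		demo_work();
	t2 = ktime_get();
	local_bh_enable();

	per_loop = ktime_to_ns(ktime_sub(t2, t1));
	do_div(per_loop, loops);	/* do_div: 64/32 divide, 32-bit safe */
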
+
+/* Calculate the parameters and apply them in context of kt #0
+ * ECP: est_calc_phase
+ * ECM: est_chain_max
+ * ECP ECM Insert Chain enable Description
+ * ---------------------------------------------------------------------------
+ * 0 0 est_temp_list 0 create kt #0 context
+ * 0 0 est_temp_list 0->1 service added, start kthread #0 task
+ * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase
+ * 1 0 est_temp_list 1 kt #0: determine est_chain_max,
+ * stop tasks, move ests to est_temp_list
+ * and free kd for kthreads 1..last
+ * 1->0 0->N kt chains 1 ests can go to kthreads
+ * 0 N kt chains 1 drain est_temp_list, create new kthread
+ * contexts, start tasks, estimate
+ */
+static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
+{
+ int genid = atomic_read(&ipvs->est_genid);
+ struct ip_vs_est_tick_data *td;
+ struct ip_vs_est_kt_data *kd;
+ struct ip_vs_estimator *est;
+ struct ip_vs_stats *stats;
+ int id, row, cid, delay;
+ bool last, last_td;
+ int chain_max;
+ int step;
+
+ if (!ip_vs_est_calc_limits(ipvs, &chain_max))
+ return;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Stop all other tasks, so that we can immediately move the
+ * estimators to est_temp_list without RCU grace period
+ */
+ mutex_lock(&ipvs->est_mutex);
+ for (id = 1; id < ipvs->est_kt_count; id++) {
+ /* netns clean up started, abort */
+ if (!READ_ONCE(ipvs->enable))
+ goto unlock2;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ continue;
+ ip_vs_est_kthread_stop(kd);
+ }
+ mutex_unlock(&ipvs->est_mutex);
+
+ /* Move all estimators to est_temp_list but carefully,
+ * all estimators and kthread data can be released while
+ * we reschedule. Even for kthread 0.
+ */
+ step = 0;
+
+ /* Order entries in est_temp_list in ascending delay, so now
+ * walk delay(desc), id(desc), cid(asc)
+ */
+ delay = IPVS_EST_NTICKS;
+
+next_delay:
+ delay--;
+ if (delay < 0)
+ goto end_dequeue;
+
+last_kt:
+ /* Destroy contexts backwards */
+ id = ipvs->est_kt_count;
+
+next_kt:
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
+ goto unlock;
+ id--;
+ if (id < 0)
+ goto next_delay;
+ kd = ipvs->est_kt_arr[id];
+ if (!kd)
+ goto next_kt;
+ /* kt 0 can exist with empty chains */
+ if (!id && kd->est_count <= 1)
+ goto next_delay;
+
+ row = kd->est_row + delay;
+ if (row >= IPVS_EST_NTICKS)
+ row -= IPVS_EST_NTICKS;
+ td = rcu_dereference_protected(kd->ticks[row], 1);
+ if (!td)
+ goto next_kt;
+
+ cid = 0;
+
+walk_chain:
+ if (kthread_should_stop())
+ goto unlock;
+ step++;
+ if (!(step & 63)) {
+ /* Give chance estimators to be added (to est_temp_list)
+ * and deleted (releasing kthread contexts)
+ */
+ mutex_unlock(&__ip_vs_mutex);
+ cond_resched();
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Current kt released ? */
+ if (id >= ipvs->est_kt_count)
+ goto last_kt;
+ if (kd != ipvs->est_kt_arr[id])
+ goto next_kt;
+ /* Current td released ? */
+ if (td != rcu_dereference_protected(kd->ticks[row], 1))
+ goto next_kt;
+ /* No fatal changes on the current kd and td */
+ }
+ est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
+ list);
+ if (!est) {
+ cid++;
+ if (cid >= IPVS_EST_TICK_CHAINS)
+ goto next_kt;
+ goto walk_chain;
+ }
+ /* We can cheat and increase est_count to protect kt 0 context
+ * from release but we prefer to keep the last estimator
+ */
+ last = kd->est_count <= 1;
+ /* Do not free kt #0 data */
+ if (!id && last)
+ goto next_delay;
+ last_td = kd->tick_len[row] <= 1;
+ stats = container_of(est, struct ip_vs_stats, est);
+ ip_vs_stop_estimator(ipvs, stats);
+ /* Tasks are stopped, move without RCU grace period */
+ est->ktid = -1;
+ est->ktrow = row - kd->est_row;
+ if (est->ktrow < 0)
+ est->ktrow += IPVS_EST_NTICKS;
+ hlist_add_head(&est->list, &ipvs->est_temp_list);
+ /* kd freed ? */
+ if (last)
+ goto next_kt;
+ /* td freed ? */
+ if (last_td)
+ goto next_kt;
+ goto walk_chain;
+
+end_dequeue:
+ /* All estimators removed while calculating ? */
+ if (!ipvs->est_kt_count)
+ goto unlock;
+ kd = ipvs->est_kt_arr[0];
+ if (!kd)
+ goto unlock;
+ kd->add_row = kd->est_row;
+ ipvs->est_chain_max = chain_max;
+ ip_vs_est_set_params(ipvs, kd);
+
+ pr_info("using max %d ests per chain, %d per kthread\n",
+ kd->chain_max, kd->est_max_count);
+
+ /* Try to keep tot_stats in kt0, enqueue it early */
+ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
+ ipvs->tot_stats->s.est.ktid == -1) {
+ hlist_del(&ipvs->tot_stats->s.est.list);
+ hlist_add_head(&ipvs->tot_stats->s.est.list,
+ &ipvs->est_temp_list);
+ }
+
+ mutex_lock(&ipvs->est_mutex);
+
+ /* We completed the calc phase, new calc phase not requested */
+ if (genid == atomic_read(&ipvs->est_genid))
+ ipvs->est_calc_phase = 0;
+
+unlock2:
+ mutex_unlock(&ipvs->est_mutex);
+
+unlock:
+ mutex_unlock(&__ip_vs_mutex);
}
void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -190,14 +928,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
- INIT_LIST_HEAD(&ipvs->est_list);
- spin_lock_init(&ipvs->est_lock);
- timer_setup(&ipvs->est_timer, estimation_timer, 0);
- mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ INIT_HLIST_HEAD(&ipvs->est_temp_list);
+ ipvs->est_kt_arr = NULL;
+ ipvs->est_max_threads = 0;
+ ipvs->est_calc_phase = 0;
+ ipvs->est_chain_max = 0;
+ ipvs->est_kt_count = 0;
+ ipvs->est_add_ktid = 0;
+ atomic_set(&ipvs->est_genid, 0);
+ atomic_set(&ipvs->est_genid_done, 0);
+ __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
return 0;
}
void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
- del_timer_sync(&ipvs->est_timer);
+ int i;
+
+ for (i = 0; i < ipvs->est_kt_count; i++)
+ ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
+ kfree(ipvs->est_kt_arr);
+ mutex_destroy(&ipvs->est_mutex);
}
diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c
index e09874d02938..d657b47c6511 100644
--- a/net/netfilter/ipvs/ip_vs_fo.c
+++ b/net/netfilter/ipvs/ip_vs_fo.c
@@ -1,20 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Weighted Fail Over module
*
* Authors: Kenny Mathis <kmathis@chokepoint.net>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Kenny Mathis : added initial functionality based on weight
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -77,3 +71,4 @@ static void __exit ip_vs_fo_cleanup(void)
module_init(ip_vs_fo_init);
module_exit(ip_vs_fo_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted failover scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 4398a72edec5..b315c608fda4 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_ftp.c: IPVS ftp application module
*
@@ -5,12 +6,6 @@
*
* Changes:
*
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
* is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
*
@@ -19,11 +14,9 @@
* Version: @(#)ip_masq_ftp.c 0.04 02/05/96
*
* Author: Wouter Gadeyne
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/moduleparam.h>
@@ -41,7 +34,7 @@
#include <linux/gfp.h>
#include <net/protocol.h>
#include <net/tcp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/ip_vs.h>
@@ -59,6 +52,7 @@ enum {
IP_VS_FTP_EPSV,
};
+static bool exiting_module;
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
@@ -124,7 +118,7 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
s = data + plen;
if (skip) {
- int found = 0;
+ bool found = false;
for (;; s++) {
if (s == data_limit)
@@ -136,7 +130,7 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (!ext && isdigit(*s))
break;
if (*s == skip)
- found = 1;
+ found = true;
} else if (*s != skip) {
break;
}
@@ -273,7 +267,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
if (cp->app_data == (void *) IP_VS_FTP_PASV) {
@@ -439,7 +433,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
return 1;
/* Linear packets are much easier to deal with. */
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return 0;
data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
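
Note the inverted return convention behind the two hunks above: skb_make_writable() returned true on success, whereas skb_ensure_writable() returns 0 on success and a negative errno on failure, hence the dropped negation:

	/* old: if (!skb_make_writable(skb, skb->len)) */
	if (skb_ensure_writable(skb, skb->len))
		return 0;	/* could not unclone/linearize the data */
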
@@ -597,8 +591,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %u\n",
- app->name, i, ports[i]);
}
return 0;
@@ -613,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- if (!ipvs)
+ if (!ipvs || !exiting_module)
return;
unregister_ip_vs_app(ipvs, &ip_vs_ftp);
@@ -635,6 +627,7 @@ static int __init ip_vs_ftp_init(void)
*/
static void __exit ip_vs_ftp_exit(void)
{
+ exiting_module = true;
unregister_pernet_subsys(&ip_vs_ftp_ops);
/* rcu_barrier() is called by netns */
}
@@ -643,3 +636,4 @@ static void __exit ip_vs_ftp_exit(void)
module_init(ip_vs_ftp_init);
module_exit(ip_vs_ftp_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs ftp helper");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index b9f375e6dc93..e6c8ed0c92f6 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Locality-Based Least-Connection scheduling module
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Martin Hamilton : fixed the terrible locking bugs
* *lock(tbl->lock) ==> *lock(&tbl->lock)
@@ -18,7 +14,6 @@
* Julian Anastasov : replaced del_timer call with del_timer_sync
* to avoid the possible race between timer
* handler and del_timer thread in SMP
- *
*/
/*
@@ -39,8 +34,7 @@
* me to write this module.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -128,7 +122,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -298,7 +291,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
- struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -389,7 +383,7 @@ static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
struct ip_vs_lblc_table *tbl = svc->sched_data;
/* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
+ timer_shutdown_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblc_flush(svc);
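
timer_container_of() is the renamed from_timer(): it recovers the structure embedding the timer_list. timer_shutdown_sync() goes further than del_timer_sync() by also permanently disarming the timer, so a late rearm from the handler cannot race with the table teardown that follows. A sketch with hypothetical names:

	static void demo_expire(struct timer_list *t)
	{
		struct demo_table *tbl =
			timer_container_of(tbl, t, periodic_timer);

		/* ... expire entries, then rearm ... */
		mod_timer(&tbl->periodic_timer, jiffies + HZ);
	}

	/* teardown: waits for a running handler and forbids rearming */
	timer_shutdown_sync(&tbl->periodic_timer);
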
@@ -555,6 +549,7 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
static int __net_init __ip_vs_lblc_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -568,15 +563,16 @@ static int __net_init __ip_vs_lblc_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblc_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblc_ctl_table = vs_vars_table;
ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
- ipvs->lblc_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ ipvs->lblc_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblc_ctl_table,
+ vars_table_size);
if (!ipvs->lblc_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblc_ctl_table);
@@ -633,3 +629,4 @@ static void __exit ip_vs_lblc_cleanup(void)
module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 542c4949937a..a25cf7bb6185 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -1,17 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Locality-Based Least-Connection with Replication scheduler
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Julian Anastasov : Added the missing (dest->weight>0)
* condition in the ip_vs_dest_set_max.
- *
*/
/*
@@ -37,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/module.h>
@@ -165,7 +159,7 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
/* get weighted least-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
{
- register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *least;
int loh, doh;
@@ -214,7 +208,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* get weighted most-connection node in the destination set */
static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
{
- register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *most;
int moh, doh;
@@ -299,7 +293,6 @@ static struct ctl_table vs_vars_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
#endif
@@ -462,7 +455,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
*/
static void ip_vs_lblcr_check_expire(struct timer_list *t)
{
- struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer);
+ struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t,
+ periodic_timer);
struct ip_vs_service *svc = tbl->svc;
unsigned long now = jiffies;
int goal;
@@ -552,7 +546,7 @@ static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
struct ip_vs_lblcr_table *tbl = svc->sched_data;
/* remove periodic timer */
- del_timer_sync(&tbl->periodic_timer);
+ timer_shutdown_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
ip_vs_lblcr_flush(svc);
@@ -741,6 +735,7 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
static int __net_init __ip_vs_lblcr_init(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ size_t vars_table_size = ARRAY_SIZE(vs_vars_table);
if (!ipvs)
return -ENOENT;
@@ -754,14 +749,15 @@ static int __net_init __ip_vs_lblcr_init(struct net *net)
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- ipvs->lblcr_ctl_table[0].procname = NULL;
+ vars_table_size = 0;
} else
ipvs->lblcr_ctl_table = vs_vars_table;
ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
- ipvs->lblcr_ctl_header =
- register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs",
+ ipvs->lblcr_ctl_table,
+ vars_table_size);
if (!ipvs->lblcr_ctl_header) {
if (!net_eq(net, &init_net))
kfree(ipvs->lblcr_ctl_table);
@@ -818,3 +814,4 @@ static void __exit ip_vs_lblcr_cleanup(void)
module_init(ip_vs_lblcr_init);
module_exit(ip_vs_lblcr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index 19a0769a989a..38cc38c5d8bb 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -1,21 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Least-Connection Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Wensong Zhang : added the ip_vs_lc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -91,3 +85,4 @@ static void __exit ip_vs_lc_cleanup(void)
module_init(ip_vs_lc_init);
module_exit(ip_vs_lc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index 94d9d349ebb0..f61f54004c9e 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -174,8 +173,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
return 0;
}
- table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
- sizeof(unsigned long), GFP_KERNEL);
+ table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
if (!table)
return -ENOMEM;
@@ -227,7 +225,7 @@ static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
}
out:
- kfree(table);
+ bitmap_free(table);
return 0;
}
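
bitmap_zalloc()/bitmap_free() take a size in bits and hide the BITS_TO_LONGS() arithmetic that the kcalloc() form spelled out by hand:

	unsigned long *map;

	map = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
	if (!map)
		return -ENOMEM;
	__set_bit(7, map);		/* normal bitops apply */
	bitmap_free(map);		/* pairs with bitmap_zalloc() */
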
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index eb8b9c883889..81974f69e5bb 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_nfct.c: Netfilter connection tracking support for IPVS
*
@@ -7,27 +8,11 @@
* Portions Copyright (C) 2003-2010
* Julian Anastasov
*
- *
- * This code is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- *
* Authors:
* Ben North <ben@redfrontdoor.org>
* Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
* Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
*
- *
* Current status:
*
* - provide conntrack confirmation for new and related connections, by
@@ -43,11 +28,9 @@
* when RELATED conntrack is created from real server (Active FTP DATA)
* - if iptables_nat is not loaded the Passive FTP will not work (the
* PASV response can not be NAT-ed) but Active FTP should work
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/types.h>
@@ -247,7 +230,7 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
__func__, ct, ARG_TUPLE(&exp->tuple));
- nf_ct_expect_related(exp);
+ nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
}
EXPORT_SYMBOL(ip_vs_nfct_expect_related);
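
nf_ct_expect_related() gained a flags argument in the conntrack core; passing 0 requests the pre-existing behaviour. For illustration only (the flag named in the comment is the one the expectation API defines, not something this patch uses):

    /* 0: default expectation handling;
     * NF_CT_EXP_F_SKIP_MASTER is the currently defined flag */
    err = nf_ct_expect_related(exp, 0);
    nf_ct_expect_put(exp);
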
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 7d9d4ac596ca..ada158c610ce 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Never Queue scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
/*
@@ -31,8 +26,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -141,3 +135,4 @@ static void __exit ip_vs_nq_cleanup(void)
module_init(ip_vs_nq_init);
module_exit(ip_vs_nq_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs never queue scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c
index f7d62c3b7329..c5c67df80a0b 100644
--- a/net/netfilter/ipvs/ip_vs_ovf.c
+++ b/net/netfilter/ipvs/ip_vs_ovf.c
@@ -1,24 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Overflow-Connection Scheduling module
*
* Authors: Raducu Deaconu <rhadoo_io@yahoo.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Scheduler implements "overflow" loadbalancing according to number of active
- * connections , will keep all conections to the node with the highest weight
+ * connections, will keep all connections to the node with the highest weight
* and overflow to the next node if the number of connections exceeds the node's
* weight.
* Note that this scheduler might not be suitable for UDP because it only uses
* active connections
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -84,3 +78,4 @@ static void __exit ip_vs_ovf_cleanup(void)
module_init(ip_vs_ovf_init);
module_exit(ip_vs_ovf_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs overflow connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 0df17caa8af6..3035079ebd99 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -1,5 +1,5 @@
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
@@ -67,7 +67,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
struct ip_vs_pe *tmp;
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOENT;
mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
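
The new check matters because ip_vs_use_count_inc() is assumed here to be built on try_module_get(), which can fail once the module has begun unloading, where a bare __module_get() could not. A minimal sketch of the assumed helper and the resulting pattern:

    /* sketch: assumed definition in ip_vs.h */
    static inline bool ip_vs_use_count_inc(void)
    {
            return try_module_get(THIS_MODULE);
    }

    if (!ip_vs_use_count_inc())
            return -ENOENT;    /* module is on its way out */
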
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index d07ef9e31c12..85f31d71e29a 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -1,5 +1,5 @@
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -184,3 +184,4 @@ static void __exit ip_vs_sip_cleanup(void)
module_init(ip_vs_sip_init);
module_exit(ip_vs_sip_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs sip helper");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 54ee84adf0bd..fd9dbca24c85 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -1,20 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_proto.c: transport protocol load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -345,7 +339,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs)
int __init ip_vs_protocol_init(void)
{
- char protocols[64];
+ char protocols[64] = { 0 };
#define REGISTER_PROTOCOL(p) \
do { \
register_ip_vs_protocol(p); \
@@ -353,8 +347,6 @@ int __init ip_vs_protocol_init(void)
strcat(protocols, (p)->name); \
} while (0)
- protocols[0] = '\0';
- protocols[2] = '\0';
#ifdef CONFIG_IP_VS_PROTO_TCP
REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
#endif
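
Zero-initializing the array covers what the two explicit stores did (the protocols[2] = '\0' one was a leftover with no effect) and keeps the buffer NUL-terminated before the first strcat() in REGISTER_PROTOCOL(). A small illustration of why the initializer is sufficient, mirroring the macro's append logic:

    char protocols[64] = { 0 };   /* whole array zero-filled */

    if (protocols[0])             /* non-empty: separator first */
            strcat(protocols, ", ");
    strcat(protocols, "TCP");     /* first append lands at index 0 */
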
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5320d39976e1..44e14acc187e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -1,17 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
*
* Authors: Julian Anastasov <ja@ssi.bg>, February 2002
* Wensong Zhang <wensong@linuxvirtualserver.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation;
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
@@ -129,7 +124,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
.conn_out_get = ah_esp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
- .csum_check = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
@@ -152,7 +146,6 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
.conn_out_get = ah_esp_conn_out_get,
.snat_handler = NULL,
.dnat_handler = NULL,
- .csum_check = NULL,
.state_transition = NULL,
.register_app = NULL,
.unregister_app = NULL,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index b0cd7d08f2a7..83e452916403 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -10,6 +10,9 @@
#include <net/ip_vs.h>
static int
+sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp);
+
+static int
sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
@@ -98,14 +101,14 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!sctp_csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
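
Note the inverted test: skb_make_writable() returned true on success, while skb_ensure_writable() follows the common kernel convention of 0 on success and a negative errno on failure. Side by side, for illustration:

    if (!skb_make_writable(skb, len))     /* old: false means failure */
            return 0;

    if (skb_ensure_writable(skb, len))    /* new: non-zero means failure */
            return 0;

The same inversion is repeated at every conversion site in this series.
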
@@ -123,7 +126,8 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (sctph->source != cp->vport || payload_csum ||
skb->ip_summed == CHECKSUM_PARTIAL) {
sctph->source = cp->vport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
@@ -145,14 +149,14 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
#endif
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!sctp_csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
@@ -171,7 +175,8 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
(skb->ip_summed == CHECKSUM_PARTIAL &&
!(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
sctph->dest = cp->dport;
- sctp_nat_csum(skb, sctph, sctphoff);
+ if (!skb_is_gso(skb))
+ sctp_nat_csum(skb, sctph, sctphoff);
} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
@@ -183,7 +188,7 @@ static int
sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
unsigned int sctphoff;
- struct sctphdr *sh, _sctph;
+ struct sctphdr *sh;
__le32 cmp, val;
#ifdef CONFIG_IP_VS_IPV6
@@ -193,10 +198,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
#endif
sctphoff = ip_hdrlen(skb);
- sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph);
- if (sh == NULL)
- return 0;
-
+ sh = (struct sctphdr *)(skb->data + sctphoff);
cmp = sh->checksum;
val = sctp_compute_cksum(skb, sctphoff);
@@ -587,7 +589,6 @@ struct ip_vs_protocol ip_vs_protocol_sctp = {
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = sctp_snat_handler,
.dnat_handler = sctp_dnat_handler,
- .csum_check = sctp_csum_check,
.state_name = sctp_state_name,
.state_transition = sctp_state_transition,
.app_conn_bind = sctp_app_conn_bind,
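
With .csum_check removed from struct ip_vs_protocol, each protocol now reaches its checker through a forward declaration and a direct call, avoiding one indirect (retpoline-affected) call per mangled packet; the NULL guard also becomes unnecessary because the function is known to exist. The shape of the change, sketched:

    static int sctp_csum_check(int af, struct sk_buff *skb,
                               struct ip_vs_protocol *pp);

    /* was: if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) */
    if (!sctp_csum_check(cp->af, skb, pp))
            return 0;

The TCP and UDP files below receive the same treatment.
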
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 1770fc6ce960..f68a1533ee45 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_proto_tcp.c: TCP load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
*
* Network name space (netns) aware.
@@ -17,8 +13,7 @@
* protocol ip_vs_proto_data and is handled by netns
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -28,10 +23,14 @@
#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/ip_vs.h>
static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp);
+
+static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
@@ -143,14 +142,14 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph,
}
-static int
+INDIRECT_CALLABLE_SCOPE int
tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct tcphdr *tcph;
unsigned int tcphoff = iph->len;
+ bool payload_csum = false;
int oldlen;
- int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
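
INDIRECT_CALLABLE_SCOPE gives tcp_snat_handler() global linkage when retpolines are enabled, so a dispatch site holding pp->snat_handler can compare the pointer against the known implementations and call them directly. A hedged sketch of such a dispatch using the indirect-call helpers (the actual call site lives in the ip_vs core, not in this file):

    #include <linux/indirect_call_wrapper.h>

    /* try the two common handlers first, falling back to a
     * plain indirect call for anything else */
    ret = INDIRECT_CALL_2(pp->snat_handler,
                          tcp_snat_handler, udp_snat_handler,
                          skb, pp, cp, iph);
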
@@ -159,14 +158,14 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!tcp_csum_check(cp->af, skb, pp))
return 0;
/* Call application helper if needed */
@@ -176,7 +175,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (ret == 1)
oldlen = skb->len - tcphoff;
else
- payload_csum = 1;
+ payload_csum = true;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -192,7 +191,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = (cp->app && pp->csum_check) ?
+ skb->ip_summed = cp->app ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
@@ -227,8 +226,8 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
struct tcphdr *tcph;
unsigned int tcphoff = iph->len;
+ bool payload_csum = false;
int oldlen;
- int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -237,14 +236,14 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - tcphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!tcp_csum_check(cp->af, skb, pp))
return 0;
/*
@@ -257,7 +256,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (ret == 1)
oldlen = skb->len - tcphoff;
else
- payload_csum = 1;
+ payload_csum = true;
}
tcph = (void *)skb_network_header(skb) + tcphoff;
@@ -275,7 +274,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = (cp->app && pp->csum_check) ?
+ skb->ip_summed = cp->app ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
@@ -315,7 +314,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -539,8 +538,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
- IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
- "%s:%d state: %s->%s conn->refcnt:%d\n",
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
+ "d:%s:%d state: %s->%s conn->refcnt:%d\n",
pd->pp->name,
((state_off == TCP_DIR_OUTPUT) ?
"output " : "input "),
@@ -548,10 +547,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
th->fin ? 'F' : '.',
th->ack ? 'A' : '.',
th->rst ? 'R' : '.',
- IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
- ntohs(cp->dport),
IP_VS_DBG_ADDR(cp->af, &cp->caddr),
ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
tcp_state_name(cp->state),
tcp_state_name(new_state),
refcount_read(&cp->refcnt));
@@ -710,7 +711,7 @@ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd
sizeof(tcp_timeouts));
if (!pd->timeout_table)
return -ENOMEM;
- pd->tcp_state_table = tcp_states;
+ pd->tcp_state_table = tcp_states;
return 0;
}
@@ -736,7 +737,6 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = tcp_snat_handler,
.dnat_handler = tcp_dnat_handler,
- .csum_check = tcp_csum_check,
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 0f53c49025f8..0f0107c80dd2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -1,21 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_proto_udp.c: UDP load balancing support for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
* Network name space (netns) aware.
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/in.h>
#include <linux/ip.h>
@@ -23,12 +17,16 @@
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/udp.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/ip_vs.h>
#include <net/ip.h>
#include <net/ip6_checksum.h>
static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp);
+
+static int
udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
struct ip_vs_proto_data *pd,
int *verdict, struct ip_vs_conn **cpp,
@@ -133,14 +131,14 @@ udp_partial_csum_update(int af, struct udphdr *uhdr,
}
-static int
+INDIRECT_CALLABLE_SCOPE int
udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
struct udphdr *udph;
unsigned int udphoff = iph->len;
+ bool payload_csum = false;
int oldlen;
- int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -149,14 +147,14 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!udp_csum_check(cp->af, skb, pp))
return 0;
/*
@@ -168,7 +166,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (ret == 1)
oldlen = skb->len - udphoff;
else
- payload_csum = 1;
+ payload_csum = true;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -186,7 +184,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = (cp->app && pp->csum_check) ?
+ skb->ip_summed = cp->app ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
@@ -222,8 +220,8 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
{
struct udphdr *udph;
unsigned int udphoff = iph->len;
+ bool payload_csum = false;
int oldlen;
- int payload_csum = 0;
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6 && iph->fragoffs)
@@ -232,14 +230,14 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
oldlen = skb->len - udphoff;
/* csum_check requires unshared skb */
- if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ if (skb_ensure_writable(skb, udphoff + sizeof(*udph)))
return 0;
if (unlikely(cp->app != NULL)) {
int ret;
/* Some checks before mangling */
- if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ if (!udp_csum_check(cp->af, skb, pp))
return 0;
/*
@@ -252,7 +250,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
if (ret == 1)
oldlen = skb->len - udphoff;
else
- payload_csum = 1;
+ payload_csum = true;
}
udph = (void *)skb_network_header(skb) + udphoff;
@@ -270,7 +268,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = (cp->app && pp->csum_check) ?
+ skb->ip_summed = cp->app ?
CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
} else {
/* full checksum calculation */
@@ -319,7 +317,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
case CHECKSUM_NONE:
skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
- /* fall through */
+ fallthrough;
case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -494,7 +492,6 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.conn_out_get = ip_vs_conn_out_get_proto,
.snat_handler = udp_snat_handler,
.dnat_handler = udp_dnat_handler,
- .csum_check = udp_csum_check,
.state_transition = udp_state_transition,
.state_name = udp_state_name,
.register_app = udp_register_app,
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index ee0530d14c5f..4125ee561cdc 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Round-Robin Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes/Changes:
* Wensong Zhang : changed the ip_vs_rr_schedule to return dest
* Julian Anastasov : fixed the NULL pointer access bug in debugging
@@ -16,11 +12,9 @@
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_rr_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -127,4 +121,5 @@ static void __exit ip_vs_rr_cleanup(void)
module_init(ip_vs_rr_init);
module_exit(ip_vs_rr_cleanup);
+MODULE_DESCRIPTION("ipvs round-robin scheduler");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index a2ff7d746ebf..c6e421c4e299 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
@@ -8,17 +9,10 @@
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
@@ -184,7 +178,8 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
}
/* increase the module use count */
- ip_vs_use_count_inc();
+ if (!ip_vs_use_count_inc())
+ return -ENOENT;
mutex_lock(&ip_vs_sched_mutex);
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index ab23cf203437..245a323c84cd 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Shortest Expected Delay scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
/*
@@ -35,8 +30,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -142,3 +136,4 @@ static void __exit ip_vs_sed_cleanup(void)
module_init(ip_vs_sed_init);
module_exit(ip_vs_sed_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs shortest expected delay scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 1e01c782583a..0e85e07e23b9 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Source Hashing scheduling module
*
* Authors: Wensong Zhang <wensong@gnuchina.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
- *
*/
/*
@@ -37,8 +32,7 @@
*
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/ip.h>
#include <linux/slab.h>
@@ -381,3 +375,4 @@ static void __exit ip_vs_sh_cleanup(void)
module_init(ip_vs_sh_init);
module_exit(ip_vs_sh_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs source hashing scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 2526be6b3d90..54dd1514ac45 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -32,8 +32,7 @@
* Persistence support, fwmark and time-out.
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/slab.h>
@@ -51,7 +50,7 @@
#include <linux/kernel.h>
#include <linux/sched/signal.h>
-#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+#include <linux/unaligned.h> /* Used for ntoh_seq and hton_seq */
#include <net/ip.h>
#include <net/sock.h>
@@ -195,6 +194,7 @@ union ip_vs_sync_conn {
#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
struct ip_vs_sync_thread_data {
+ struct task_struct *task;
struct netns_ipvs *ipvs;
struct socket *sock;
char *buf;
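
Embedding the task pointer here lets the master and backup paths share one ip_vs_sync_thread_data array per role (master_tinfo/backup_tinfo) instead of a separate task_struct *array plus a master_thread field; wakeups then index the array by sync-state id, as the sb_queue_tail() hunk below shows. Assumed final shape of the struct, with nothing beyond what the diff implies:

    struct ip_vs_sync_thread_data {
            struct task_struct *task;  /* kthread for this id */
            struct netns_ipvs *ipvs;
            struct socket *sock;       /* mcast socket for this id */
            char *buf;                 /* backup receive buffer */
            int id;
    };
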
@@ -241,9 +241,6 @@ struct ip_vs_sync_thread_data {
| IPVS Sync Connection (1) |
*/
-#define SYNC_MESG_HEADER_LEN 4
-#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
-
/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
@@ -374,8 +371,11 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs,
max(IPVS_SYNC_SEND_DELAY, 1));
ms->sync_queue_len++;
list_add_tail(&sb->list, &ms->sync_queue);
- if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
- wake_up_process(ms->master_thread);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) {
+ int id = (int)(ms - ipvs->ms);
+
+ wake_up_process(ipvs->master_tinfo[id].task);
+ }
} else
ip_vs_sync_buff_release(sb);
spin_unlock(&ipvs->sync_lock);
@@ -602,7 +602,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
struct ip_vs_sync_conn_options *opt =
(struct ip_vs_sync_conn_options *)&s[1];
- memcpy(opt, &cp->in_seq, sizeof(*opt));
+ memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
}
m->nr_conns++;
@@ -614,7 +614,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
cp = cp->control;
if (cp) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
ip_vs_sync_conn(ipvs, cp, pkts);
@@ -775,7 +775,7 @@ control:
if (!cp)
return;
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
- pkts = atomic_add_return(1, &cp->in_pkts);
+ pkts = atomic_inc_return(&cp->in_pkts);
else
pkts = sysctl_sync_threshold(ipvs);
goto sloop;
@@ -1235,7 +1235,7 @@ static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer,
p = msg_end;
if (p + sizeof(s->v4) > buffer+buflen) {
- IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
return;
}
s = (union ip_vs_sync_conn *)p;
@@ -1279,12 +1279,12 @@ static void set_sock_size(struct sock *sk, int mode, int val)
lock_sock(sk);
if (mode) {
val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
- sysctl_wmem_max);
+ READ_ONCE(sysctl_wmem_max));
sk->sk_sndbuf = val * 2;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
} else {
val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
- sysctl_rmem_max);
+ READ_ONCE(sysctl_rmem_max));
sk->sk_rcvbuf = val * 2;
sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
}
@@ -1296,20 +1296,14 @@ static void set_sock_size(struct sock *sk, int mode, int val)
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
{
- struct inet_sock *inet = inet_sk(sk);
-
/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
- lock_sock(sk);
- inet->mc_loop = loop ? 1 : 0;
+ inet_assign_bit(MC_LOOP, sk, loop);
#ifdef CONFIG_IP_VS_IPV6
- if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
-
+ if (READ_ONCE(sk->sk_family) == AF_INET6) {
/* IPV6_MULTICAST_LOOP */
- np->mc_loop = loop ? 1 : 0;
+ inet6_assign_bit(MC6_LOOP, sk, loop);
}
#endif
- release_sock(sk);
}
/*
@@ -1321,13 +1315,13 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
- inet->mc_ttl = ttl;
+ WRITE_ONCE(inet->mc_ttl, ttl);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_HOPS */
- np->mcast_hops = ttl;
+ WRITE_ONCE(np->mcast_hops, ttl);
}
#endif
release_sock(sk);
@@ -1340,13 +1334,13 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MTU_DISCOVER */
- np->pmtudisc = val;
+ WRITE_ONCE(np->pmtudisc, val);
}
#endif
release_sock(sk);
@@ -1370,7 +1364,7 @@ static int set_mcast_if(struct sock *sk, struct net_device *dev)
struct ipv6_pinfo *np = inet6_sk(sk);
/* IPV6_MULTICAST_IF */
- np->mcast_oif = dev->ifindex;
+ WRITE_ONCE(np->mcast_oif, dev->ifindex);
}
#endif
release_sock(sk);
@@ -1440,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev)
sin.sin_addr.s_addr = addr;
sin.sin_port = 0;
- return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+ return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin));
}
static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen,
@@ -1506,8 +1500,8 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id,
}
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id);
- result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
- salen, 0);
+ result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr,
+ salen, 0);
if (result < 0) {
pr_err("Error connecting to the multicast addr\n");
goto error;
@@ -1547,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id,
get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id);
sock->sk->sk_bound_dev_if = dev->ifindex;
- result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen);
+ result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen);
if (result < 0) {
pr_err("Error binding to the multicast addr\n");
goto error;
@@ -1581,13 +1575,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
struct kvec iov;
int len;
- EnterFunction(7);
iov.iov_base = (void *)buffer;
iov.iov_len = length;
len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
- LeaveFunction(7);
return len;
}
@@ -1613,15 +1605,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
struct kvec iov = {buffer, buflen};
int len;
- EnterFunction(7);
-
/* Receive a packet */
- iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, buflen);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
if (len < 0)
return len;
- LeaveFunction(7);
return len;
}
@@ -1636,8 +1625,10 @@ static void master_wakeup_work_handler(struct work_struct *work)
spin_lock_bh(&ipvs->sync_lock);
if (ms->sync_queue_len &&
ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ int id = (int)(ms - ipvs->ms);
+
ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
- wake_up_process(ms->master_thread);
+ wake_up_process(ipvs->master_tinfo[id].task);
}
spin_unlock_bh(&ipvs->sync_lock);
}
@@ -1703,10 +1694,6 @@ done:
if (sb)
ip_vs_sync_buff_release(sb);
- /* release the sending multicast socket */
- sock_release(tinfo->sock);
- kfree(tinfo);
-
return 0;
}
@@ -1715,6 +1702,8 @@ static int sync_thread_backup(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = tinfo->ipvs;
+ struct sock *sk = tinfo->sock->sk;
+ struct udp_sock *up = udp_sk(sk);
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
@@ -1722,12 +1711,14 @@ static int sync_thread_backup(void *data)
ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id);
while (!kthread_should_stop()) {
- wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
- !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
- || kthread_should_stop());
+ wait_event_interruptible(*sk_sleep(sk),
+ !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue) ||
+ kthread_should_stop());
/* do we have data now? */
- while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+ while (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+ !skb_queue_empty_lockless(&up->reader_queue)) {
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->bcfg.sync_maxlen);
if (len <= 0) {
@@ -1740,11 +1731,6 @@ static int sync_thread_backup(void *data)
}
}
- /* release the sending multicast socket */
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
-
return 0;
}
@@ -1752,8 +1738,8 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
int state)
{
- struct ip_vs_sync_thread_data *tinfo = NULL;
- struct task_struct **array = NULL, *task;
+ struct ip_vs_sync_thread_data *ti = NULL, *tinfo;
+ struct task_struct *task;
struct net_device *dev;
char *name;
int (*threadfn)(void *data);
@@ -1765,6 +1751,10 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ /* increase the module use count */
+ if (!ip_vs_use_count_inc())
+ return -ENOPROTOOPT;
+
/* Do not hold one mutex and then to block on another */
for (;;) {
rtnl_lock();
@@ -1822,7 +1812,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
threadfn = sync_thread_master;
} else if (state == IP_VS_STATE_BACKUP) {
result = -EEXIST;
- if (ipvs->backup_threads)
+ if (ipvs->backup_tinfo)
goto out_early;
ipvs->bcfg = *c;
@@ -1849,28 +1839,22 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
master_wakeup_work_handler);
ms->ipvs = ipvs;
}
- } else {
- array = kcalloc(count, sizeof(struct task_struct *),
- GFP_KERNEL);
- result = -ENOMEM;
- if (!array)
- goto out;
}
+ result = -ENOMEM;
+ ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data),
+ GFP_KERNEL);
+ if (!ti)
+ goto out;
for (id = 0; id < count; id++) {
- result = -ENOMEM;
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto out;
+ tinfo = &ti[id];
tinfo->ipvs = ipvs;
- tinfo->sock = NULL;
if (state == IP_VS_STATE_BACKUP) {
+ result = -ENOMEM;
tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen,
GFP_KERNEL);
if (!tinfo->buf)
goto out;
- } else {
- tinfo->buf = NULL;
}
tinfo->id = id;
if (state == IP_VS_STATE_MASTER)
@@ -1885,17 +1869,15 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
result = PTR_ERR(task);
goto out;
}
- tinfo = NULL;
- if (state == IP_VS_STATE_MASTER)
- ipvs->ms[id].master_thread = task;
- else
- array[id] = task;
+ tinfo->task = task;
}
/* mark as active */
- if (state == IP_VS_STATE_BACKUP)
- ipvs->backup_threads = array;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->master_tinfo = ti;
+ else
+ ipvs->backup_tinfo = ti;
spin_lock_bh(&ipvs->sync_buff_lock);
ipvs->sync_state |= state;
spin_unlock_bh(&ipvs->sync_buff_lock);
@@ -1903,56 +1885,64 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c,
mutex_unlock(&ipvs->sync_mutex);
rtnl_unlock();
- /* increase the module use count */
- ip_vs_use_count_inc();
-
return 0;
out:
/* We do not need RTNL lock anymore, release it here so that
- * sock_release below and in the kthreads can use rtnl_lock
- * to leave the mcast group.
+ * sock_release below can use rtnl_lock to leave the mcast group.
*/
rtnl_unlock();
- count = id;
- while (count-- > 0) {
- if (state == IP_VS_STATE_MASTER)
- kthread_stop(ipvs->ms[count].master_thread);
- else
- kthread_stop(array[count]);
+ id = min(id, count - 1);
+ if (ti) {
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->task)
+ kthread_stop(tinfo->task);
+ }
}
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
kfree(ipvs->ms);
ipvs->ms = NULL;
}
mutex_unlock(&ipvs->sync_mutex);
- if (tinfo) {
- if (tinfo->sock)
- sock_release(tinfo->sock);
- kfree(tinfo->buf);
- kfree(tinfo);
+
+ /* No more mutexes, release socks */
+ if (ti) {
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ }
+ kfree(ti);
}
- kfree(array);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
return result;
out_early:
mutex_unlock(&ipvs->sync_mutex);
rtnl_unlock();
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
return result;
}
int stop_sync_thread(struct netns_ipvs *ipvs, int state)
{
- struct task_struct **array;
+ struct ip_vs_sync_thread_data *ti, *tinfo;
int id;
int retc = -EINVAL;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+ mutex_lock(&ipvs->sync_mutex);
if (state == IP_VS_STATE_MASTER) {
+ retc = -ESRCH;
if (!ipvs->ms)
- return -ESRCH;
+ goto err;
+ ti = ipvs->master_tinfo;
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
@@ -1971,38 +1961,56 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state)
struct ipvs_master_sync_state *ms = &ipvs->ms[id];
int ret;
+ tinfo = &ti[id];
pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(ms->master_thread));
+ task_pid_nr(tinfo->task));
cancel_delayed_work_sync(&ms->master_wakeup_work);
- ret = kthread_stop(ms->master_thread);
+ ret = kthread_stop(tinfo->task);
if (retc >= 0)
retc = ret;
}
kfree(ipvs->ms);
ipvs->ms = NULL;
+ ipvs->master_tinfo = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!ipvs->backup_threads)
- return -ESRCH;
+ retc = -ESRCH;
+ if (!ipvs->backup_tinfo)
+ goto err;
+ ti = ipvs->backup_tinfo;
ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
- array = ipvs->backup_threads;
retc = 0;
for (id = ipvs->threads_mask; id >= 0; id--) {
int ret;
+ tinfo = &ti[id];
pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(array[id]));
- ret = kthread_stop(array[id]);
+ task_pid_nr(tinfo->task));
+ ret = kthread_stop(tinfo->task);
if (retc >= 0)
retc = ret;
}
- kfree(array);
- ipvs->backup_threads = NULL;
+ ipvs->backup_tinfo = NULL;
+ } else {
+ goto err;
}
+ id = ipvs->threads_mask;
+ mutex_unlock(&ipvs->sync_mutex);
+
+ /* No more mutexes, release socks */
+ for (tinfo = ti + id; tinfo >= ti; tinfo--) {
+ if (tinfo->sock)
+ sock_release(tinfo->sock);
+ kfree(tinfo->buf);
+ }
+ kfree(ti);
/* decrease the module use count */
ip_vs_use_count_dec();
+ return retc;
+err:
+ mutex_unlock(&ipvs->sync_mutex);
return retc;
}
@@ -2021,7 +2029,6 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
{
int retc;
- mutex_lock(&ipvs->sync_mutex);
retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Master Daemon\n");
@@ -2029,5 +2036,4 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs)
retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP);
if (retc && retc != -ESRCH)
pr_err("Failed to stop Backup Daemon\n");
- mutex_unlock(&ipvs->sync_mutex);
}
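
Note on the ordering above: start_sync_thread() now pins the module with ip_vs_use_count_inc() before any kthread is created and drops the reference on both failure paths, while stop_sync_thread() takes sync_mutex itself (its caller ip_vs_sync_net_cleanup() no longer does) and only calls ip_vs_use_count_dec() after the threads are stopped and their sockets released. The intended pairing, as a minimal sketch with the surrounding logic elided:

    if (!ip_vs_use_count_inc())       /* pin the module first */
            return -ENOPROTOOPT;
    ...
    task = kthread_create(threadfn, tinfo, "%s", name);  /* per id */
    ...
    out:
            ...                       /* stop threads, release socks */
            ip_vs_use_count_dec();    /* only then drop the reference */
            return result;
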
diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c
new file mode 100644
index 000000000000..dbb7f5fd4688
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_twos.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* IPVS: Power of Twos Choice Scheduling module
+ *
+ * Authors: Darby Payne <darby.payne@applovin.com>
+ */
+
+#define pr_fmt(fmt) "IPVS: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+/* Power of Twos Choice scheduling, algorithm originally described by
+ * Michael Mitzenmacher.
+ *
+ * Randomly picks two destinations and picks the one with the fewest
+ * connections
+ *
+ * The algorithm calculates a few variables
+ * - total_weight = sum of all weights
+ * - rweight1 = random number between [0,total_weight]
+ * - rweight2 = random number between [0,total_weight]
+ *
+ * For each destination
+ * decrement rweight1 and rweight2 by the destination weight
+ * pick choice1 when rweight1 is <= 0
+ * pick choice2 when rweight2 is <= 0
+ *
+ * Return choice2 if choice2 has fewer connections than choice1,
+ * normalized by weight
+ *
+ * References
+ * ----------
+ *
+ * [Mitzenmacher 2016]
+ * The Power of Two Random Choices: A Survey of Techniques and Results
+ * Michael Mitzenmacher, Andrea W. Richa, Ramesh Sitaraman
+ * http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/twosurvey.pdf
+ *
+ */
+static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc,
+ const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *choice1 = NULL, *choice2 = NULL;
+ int rweight1, rweight2, weight1 = -1, weight2 = -1, overhead1 = 0;
+ int overhead2, total_weight = 0, weight;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/* Sum all weights and remember a usable destination as fallback */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD)) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ total_weight += weight;
+ choice1 = dest;
+ }
+ }
+ }
+
+ if (!choice1) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Add 1 to total_weight so that the random weights are inclusive
+ * from 0 to total_weight
+ */
+ total_weight += 1;
+ rweight1 = get_random_u32_below(total_weight);
+ rweight2 = get_random_u32_below(total_weight);
+
+ /* Pick two weighted servers */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ weight = atomic_read(&dest->weight);
+ if (weight <= 0)
+ continue;
+
+ rweight1 -= weight;
+ rweight2 -= weight;
+
+ if (rweight1 <= 0 && weight1 == -1) {
+ choice1 = dest;
+ weight1 = weight;
+ overhead1 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (rweight2 <= 0 && weight2 == -1) {
+ choice2 = dest;
+ weight2 = weight;
+ overhead2 = ip_vs_dest_conn_overhead(dest);
+ }
+
+ if (weight1 != -1 && weight2 != -1)
+ goto nextstage;
+ }
+
+nextstage:
+ if (choice2 && (weight2 * overhead1) > (weight1 * overhead2))
+ choice1 = choice2;
+
+ IP_VS_DBG_BUF(6, "twos: server %s:%u conns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(choice1->af, &choice1->addr),
+ ntohs(choice1->port), atomic_read(&choice1->activeconns),
+ refcount_read(&choice1->refcnt),
+ atomic_read(&choice1->weight));
+
+ return choice1;
+}
+
+static struct ip_vs_scheduler ip_vs_twos_scheduler = {
+ .name = "twos",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_twos_scheduler.n_list),
+ .schedule = ip_vs_twos_schedule,
+};
+
+static int __init ip_vs_twos_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_twos_scheduler);
+}
+
+static void __exit ip_vs_twos_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_twos_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_twos_init);
+module_exit(ip_vs_twos_cleanup);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs power of twos choice scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index 6add39e0ec20..9da445ca09a1 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Weighted Least-Connection Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
* Wensong Zhang : changed to use the inactconns in scheduling
@@ -16,11 +12,9 @@
* Wensong Zhang : changed for the d-linked destination list
* Wensong Zhang : added the ip_vs_wlc_update_svc
* Wensong Zhang : added any dest with weight=0 is quiesced
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -114,3 +108,4 @@ static void __exit ip_vs_wlc_cleanup(void)
module_init(ip_vs_wlc_init);
module_exit(ip_vs_wlc_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted least connection scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 62258dd457ac..99f09cbf2d9b 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPVS: Weighted Round-Robin Scheduling module
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
* Wensong Zhang : changed some cosmetic things for debugging
@@ -15,11 +11,9 @@
* Wensong Zhang : added the ip_vs_wrr_update_svc
* Julian Anastasov : fixed the bug of returning destination
* with weight 0 when all weights are zero
- *
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -268,3 +262,4 @@ static void __exit ip_vs_wrr_cleanup(void)
module_init(ip_vs_wrr_init);
module_exit(ip_vs_wrr_cleanup);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ipvs weighted round-robin scheduler");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 473cce2a5231..3162ce3c2640 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip_vs_xmit.c: various packet transmitters for IPVS
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Julian Anastasov <ja@ssi.bg>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
*
* Description of forwarding methods:
@@ -25,13 +21,14 @@
* - the only place where we can see skb->sk != NULL
*/
-#define KMSG_COMPONENT "IPVS"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "IPVS: " fmt
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
+#include <net/gue.h>
+#include <net/gre.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
@@ -39,6 +36,7 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
@@ -98,7 +96,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
if (!dest_dst)
return NULL;
dst = dest_dst->dst_cache;
- if (dst->obsolete &&
+ if (READ_ONCE(dst->obsolete) &&
dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
return dest_dst;
@@ -120,13 +118,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
return false;
}
-/* Get route to daddr, update *saddr, optionally bind route to saddr */
+/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *ret_saddr)
{
struct flowi4 fl4;
struct rtable *rt;
- int loop = 0;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -136,23 +133,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
retry:
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
- /* Invalid saddr ? */
- if (PTR_ERR(rt) == -EINVAL && *saddr &&
- rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
- *saddr = 0;
- flowi4_update_output(&fl4, 0, 0, daddr, 0);
- goto retry;
- }
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
return NULL;
- } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ }
+ if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
- *saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
- loop++;
+ flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
+ rt_mode = 0;
goto retry;
}
- *saddr = fl4.saddr;
+ if (ret_saddr)
+ *ret_saddr = fl4.saddr;
return rt;
}
@@ -181,7 +172,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
(addr_type & IPV6_ADDR_LOOPBACK);
old_rt_is_local = __ip_vs_is_local_route6(
- (struct rt6_info *)skb_dst(skb));
+ dst_rt6_info(skb_dst(skb)));
} else
#endif
{
@@ -209,7 +200,7 @@ static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
struct rtable *ort = skb_rtable(skb);
if (!skb->dev && sk && sk_fullsock(sk))
- ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
@@ -272,13 +263,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return false;
ipv6_hdr(skb)->hop_limit--;
@@ -287,13 +278,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
{
if (ip_hdr(skb)->ttl <= 1) {
/* Tell the sender its packet died... */
- __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
return false;
}
/* don't propagate ttl change to cloned packets */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
/* Decrease ttl */
@@ -319,7 +310,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rtable *) dest_dst->dst_cache;
+ rt = dst_rtable(dest_dst->dst_cache);
else {
dest_dst = ip_vs_dest_dst_alloc();
spin_lock_bh(&dest->dst_lock);
@@ -340,24 +331,20 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest_dst->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.ip;
} else {
- __be32 saddr = htonl(INADDR_ANY);
-
noref = 0;
/* For such unconfigured boxes avoid many route lookups
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
if (!rt)
goto err_unreach;
- if (ret_saddr)
- *ret_saddr = saddr;
}
local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
@@ -382,6 +369,21 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
} else {
mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (!dest)
+ goto err_put;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
+
+ if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
+ mtu -= gre_calc_hlen(tflags);
+ }
if (mtu < 68) {
IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
goto err_put;
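
The new branch charges the tunnel encapsulation overhead against the route MTU before the size check. A worked example under the usual header sizes (quoted here for illustration: udphdr 8 bytes, base guehdr 4, GUE_LEN_PRIV 4, GUE_PLEN_REMCSUM 4, GRE base 4 plus 4 with checksum), assuming a 1500-byte route MTU:

    plain ipip:      1500 - 20 (iphdr)              = 1480
    GUE:             1480 - 8 (udphdr) - 4 (guehdr) = 1468
    GUE + remcsum:   1468 - 4 (priv) - 4 (remcsum)  = 1460
    GRE + checksum:  1480 - gre_calc_hlen(CSUM)     = 1480 - 8 = 1472

The IPv6 hunk below applies the same computation starting from dst_mtu() minus sizeof(struct ipv6hdr).
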
@@ -393,12 +395,9 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
goto err_put;
skb_dst_drop(skb);
- if (noref) {
- if (!local)
- skb_dst_set_noref(skb, &rt->dst);
- else
- skb_dst_set(skb, dst_clone(&rt->dst));
- } else
+ if (noref)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
skb_dst_set(skb, &rt->dst);
return local;
@@ -470,7 +469,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
if (likely(dest_dst))
- rt = (struct rt6_info *) dest_dst->dst_cache;
+ rt = dst_rt6_info(dest_dst->dst_cache);
else {
u32 cookie;
@@ -490,13 +489,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
ip_vs_dest_dst_free(dest_dst);
goto err_unreach;
}
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
cookie = rt6_get_cookie(rt);
__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest_dst->dst_saddr.in6,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__rcuref));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.in6;
@@ -506,7 +505,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
rt_mode);
if (!dst)
goto err_unreach;
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
}
local = __ip_vs_is_local_route6(rt);
@@ -533,6 +532,21 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
mtu = dst_mtu(&rt->dst);
else {
mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (!dest)
+ goto err_put;
+ if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
+ if ((dest->tun_flags &
+ IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL)
+ mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
+
+ if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
+ mtu -= gre_calc_hlen(tflags);
+ }
if (mtu < IPV6_MIN_MTU) {
IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
IPV6_MIN_MTU);
@@ -545,12 +559,9 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
goto err_put;
skb_dst_drop(skb);
- if (noref) {
- if (!local)
- skb_dst_set_noref(skb, &rt->dst);
- else
- skb_dst_set(skb, dst_clone(&rt->dst));
- } else
+ if (noref)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
skb_dst_set(skb, &rt->dst);
return local;
@@ -584,8 +595,10 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
ret = ip_vs_confirm_conntrack(skb);
if (ret == NF_ACCEPT) {
- nf_reset(skb);
+ nf_reset_ct(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
}
return ret;
}
@@ -626,6 +639,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -646,6 +661,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
+ if (skb->dev)
+ skb_clear_tstamp(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -677,8 +694,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct iphdr *iph = ip_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
goto tx_error;
@@ -690,12 +705,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -706,8 +719,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- EnterFunction(10);
-
if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
&iph->daddr, NULL,
ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
@@ -718,12 +729,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -739,8 +748,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rtable *rt; /* Route to the other host */
int local, rc, was_input;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
@@ -787,7 +794,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -810,12 +817,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -827,8 +832,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct rt6_info *rt; /* Route to the other host */
int local, rc;
- EnterFunction(10);
-
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
@@ -847,7 +850,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_RT_MODE_RDR);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -876,7 +879,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -898,11 +901,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- LeaveFunction(10);
return rc;
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
return NF_STOLEN;
}
@@ -965,7 +966,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
old_dsfield = ipv4_get_dsfield(old_iph);
*ttl = old_iph->ttl;
if (payload_len)
- *payload_len = ntohs(old_iph->tot_len);
+ *payload_len = skb_ip_totlen(skb);
}
/* Implement full-functionality option for ECN encapsulation */
@@ -989,6 +990,98 @@ static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
}
}
+static int
+ipvs_gue_encap(struct net *net, struct sk_buff *skb,
+ struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+ __be16 dport;
+ __be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
+ struct udphdr *udph; /* Our new UDP header */
+ struct guehdr *gueh; /* Our new GUE header */
+ size_t hdrlen, optlen = 0;
+ void *data;
+ bool need_priv = false;
+
+ if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ need_priv = true;
+ }
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ gueh = (struct guehdr *)skb->data;
+
+ gueh->control = 0;
+ gueh->version = 0;
+ gueh->hlen = optlen >> 2;
+ gueh->flags = 0;
+ gueh->proto_ctype = *next_protocol;
+
+ data = &gueh[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd;
+
+ gueh->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd = data;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ udph = udp_hdr(skb);
+
+ dport = cp->dest->tun_port;
+ udph->dest = dport;
+ udph->source = sport;
+ udph->len = htons(skb->len);
+ udph->check = 0;
+
+ *next_protocol = IPPROTO_UDP;
+
+ return 0;
+}
+
+static void
+ipvs_gre_encap(struct net *net, struct sk_buff *skb,
+ struct ip_vs_conn *cp, __u8 *next_protocol)
+{
+ __be16 proto = *next_protocol == IPPROTO_IPIP ?
+ htons(ETH_P_IP) : htons(ETH_P_IPV6);
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
+ size_t hdrlen;
+
+ if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
+
+ hdrlen = gre_calc_hlen(tflags);
+ gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
+
+ *next_protocol = IPPROTO_GRE;
+}
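The same per-tunnel overhead is accounted twice in this file: once when the route MTU is shrunk in __ip_vs_get_out_rt()/__ip_vs_get_out_rt_v6(), and again when max_headroom is grown before the encapsulation is pushed. A minimal user-space sketch of that arithmetic; the sizes are assumptions that mirror the kernel's definitions (struct guehdr is 4 bytes, a base GRE header is 4 bytes plus 4 more for the checksum option) rather than values pulled from kernel headers:

#include <stdbool.h>
#include <stdio.h>

#define UDP_HLEN	 8	/* sizeof(struct udphdr) */
#define GUE_HLEN	 4	/* sizeof(struct guehdr), assumed */
#define GUE_LEN_PRIV	 4	/* private flags word, assumed */
#define GUE_PLEN_REMCSUM 4	/* two 16-bit csum offsets, assumed */
#define GRE_BASE_HLEN	 4
#define GRE_CSUM_HLEN	 4	/* checksum + reserved, assumed */

/* Extra bytes a GUE tunnel adds per packet. */
static unsigned int gue_overhead(bool remcsum)
{
	unsigned int optlen = remcsum ? GUE_PLEN_REMCSUM + GUE_LEN_PRIV : 0;

	return UDP_HLEN + GUE_HLEN + optlen;
}

/* Extra bytes a GRE tunnel adds per packet. */
static unsigned int gre_overhead(bool csum)
{
	return GRE_BASE_HLEN + (csum ? GRE_CSUM_HLEN : 0);
}

int main(void)
{
	printf("GUE with remote csum: %u bytes\n", gue_overhead(true)); /* 20 */
	printf("GRE with csum:        %u bytes\n", gre_overhead(true)); /* 8 */
	return 0;
}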
+
/*
* IP Tunneling transmitter
*
@@ -1025,8 +1118,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
-
- EnterFunction(10);
+ int tun_type, gso_type;
+ int tun_flags;
local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
@@ -1046,19 +1139,78 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+ tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
+ size_t gre_hdrlen;
+
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
+ gre_hdrlen = gre_calc_hlen(tflags);
+
+ max_headroom += gre_hdrlen;
+ }
+
/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, NULL, &dsfield,
&ttl, dfp);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)))
+ gso_type = __tun_gso_type_mask(AF_INET, cp->af);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ gso_type |= SKB_GSO_GRE_CSUM;
+ else
+ gso_type |= SKB_GSO_GRE;
+ }
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
skb->transport_header = skb->network_header;
+ skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+ ipvs_gre_encap(net, skb, cp, &next_protocol);
+
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1086,14 +1238,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
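The GSO feature selection in ip_vs_tunnel_xmit() above is a pure function of the destination's tunnel type and flags. A stand-alone sketch of the same decision table; the GSO_* values are illustrative placeholders, not the kernel's SKB_GSO_* bit positions:

#include <stdbool.h>

enum {
	GSO_UDP_TUNNEL      = 1 << 0,	/* placeholder bits */
	GSO_UDP_TUNNEL_CSUM = 1 << 1,
	GSO_TUNNEL_REMCSUM  = 1 << 2,
	GSO_GRE             = 1 << 3,
	GSO_GRE_CSUM        = 1 << 4,
};

enum tun_type { TUN_GUE, TUN_GRE };

/* Mirror of the flag-to-GSO mapping used for both IPv4 and IPv6. */
static unsigned int tun_gso_bits(enum tun_type type, bool csum, bool remcsum,
				 bool csum_partial)
{
	unsigned int gso = 0;

	if (type == TUN_GUE) {
		gso |= (csum || remcsum) ? GSO_UDP_TUNNEL_CSUM : GSO_UDP_TUNNEL;
		if (remcsum && csum_partial)
			gso |= GSO_TUNNEL_REMCSUM;
	} else {
		gso |= csum ? GSO_GRE_CSUM : GSO_GRE;
	}

	return gso;
}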
@@ -1102,6 +1250,8 @@ int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
+ struct netns_ipvs *ipvs = cp->ipvs;
+ struct net *net = ipvs->net;
struct rt6_info *rt; /* Route to the other host */
struct in6_addr saddr; /* Source for tunnel */
struct net_device *tdev; /* Device to other host */
@@ -1112,10 +1262,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int ret, local;
+ int tun_type, gso_type;
+ int tun_flags;
- EnterFunction(10);
-
- local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
+ local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
&saddr, ipvsh, 1,
IP_VS_RT_MODE_LOCAL |
@@ -1126,7 +1276,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (local)
return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
tdev = rt->dst.dev;
/*
@@ -1134,17 +1284,76 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+ tun_type = cp->dest->tun_type;
+ tun_flags = cp->dest->tun_flags;
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ size_t gue_hdrlen, gue_optlen = 0;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
+ }
+ gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
+
+ max_headroom += sizeof(struct udphdr) + gue_hdrlen;
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
+ size_t gre_hdrlen;
+
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ __set_bit(IP_TUNNEL_CSUM_BIT, tflags);
+ gre_hdrlen = gre_calc_hlen(tflags);
+
+ max_headroom += gre_hdrlen;
+ }
+
skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
&next_protocol, &payload_len,
&dsfield, &ttl, NULL);
if (IS_ERR(skb))
- goto tx_error;
+ return NF_STOLEN;
- if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)))
+ gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ else
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ gso_type |= SKB_GSO_TUNNEL_REMCSUM;
+ }
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+ if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
+ gso_type |= SKB_GSO_GRE_CSUM;
+ else
+ gso_type |= SKB_GSO_GRE;
+ }
+
+ if (iptunnel_handle_offloads(skb, gso_type))
goto tx_error;
skb->transport_header = skb->network_header;
+ skb_set_inner_ipproto(skb, next_protocol);
+ skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));
+
+ if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
+ bool check = false;
+
+ if (ipvs_gue_encap(net, skb, cp, &next_protocol))
+ goto tx_error;
+
+ if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
+ (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
+ check = true;
+
+ udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
+ } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
+ ipvs_gre_encap(net, skb, cp, &next_protocol);
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1167,18 +1376,14 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
- ip6_local_out(cp->ipvs->net, skb->sk, skb);
+ ip6_local_out(net, skb->sk, skb);
else if (ret == NF_DROP)
kfree_skb(skb);
- LeaveFunction(10);
-
return NF_STOLEN;
tx_error:
- if (!IS_ERR(skb))
- kfree_skb(skb);
- LeaveFunction(10);
+ kfree_skb(skb);
return NF_STOLEN;
}
#endif
@@ -1194,8 +1399,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
IP_VS_RT_MODE_LOCAL |
IP_VS_RT_MODE_NON_LOCAL |
@@ -1212,12 +1415,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
@@ -1228,8 +1429,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
{
int local;
- EnterFunction(10);
-
local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
&cp->daddr.in6,
NULL, ipvsh, 0,
@@ -1246,12 +1445,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
- LeaveFunction(10);
return NF_STOLEN;
tx_error:
kfree_skb(skb);
- LeaveFunction(10);
return NF_STOLEN;
}
#endif
@@ -1271,8 +1468,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode, was_input;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1283,7 +1478,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1328,7 +1523,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1339,14 +1534,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
- out:
- LeaveFunction(10);
return rc;
}
@@ -1361,8 +1553,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
int local;
int rt_mode;
- EnterFunction(10);
-
/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
forwarded directly here, because there is no need to
translate address/port back */
@@ -1373,7 +1563,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rc = NF_ACCEPT;
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
- goto out;
+ return rc;
}
/*
@@ -1388,7 +1578,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
&cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
if (local < 0)
goto tx_error;
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -1417,7 +1607,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
/* copy-on-write the packet before mangling it */
- if (!skb_make_writable(skb, offset))
+ if (skb_ensure_writable(skb, offset))
goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
@@ -1428,14 +1618,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->ignore_df = 1;
- rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
- goto out;
+ return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
tx_error:
kfree_skb(skb);
rc = NF_STOLEN;
-out:
- LeaveFunction(10);
return rc;
}
#endif
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..46e667a50d98
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <uapi/linux/netfilter_ipv4.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
+ const struct nf_hook_state *s)
+{
+ const struct bpf_prog *prog = bpf_prog;
+ struct bpf_nf_ctx ctx = {
+ .state = s,
+ .skb = skb,
+ };
+
+ return bpf_prog_run_pin_on_cpu(prog, &ctx);
+}
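nf_hook_run_bpf() is the bridge every packet takes through a BPF netfilter hook: it wraps the hook state and skb in a bpf_nf_ctx and uses the program's return value directly as the netfilter verdict. A minimal program for this hook could look like the sketch below (assumes a BTF-generated vmlinux.h and libbpf's SEC("netfilter") convention; NF_ACCEPT/NF_DROP are uapi defines restated locally):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define NF_DROP   0
#define NF_ACCEPT 1

SEC("netfilter")
int sample_filter(struct bpf_nf_ctx *ctx)
{
	struct sk_buff *skb = ctx->skb;

	/* ctx->skb and ctx->state are read-only trusted pointers;
	 * nf_is_valid_access() later in this file enforces that.
	 */
	if (skb->len == 0)
		return NF_DROP;

	return NF_ACCEPT;
}

char _license[] SEC("license") = "GPL";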
+
+struct bpf_nf_link {
+ struct bpf_link link;
+ struct nf_hook_ops hook_ops;
+ netns_tracker ns_tracker;
+ struct net *net;
+ u32 dead;
+ const struct nf_defrag_hook *defrag_hook;
+};
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+static const struct nf_defrag_hook *
+get_proto_defrag_hook(struct bpf_nf_link *link,
+ const struct nf_defrag_hook __rcu **ptr_global_hook,
+ const char *mod)
+{
+ const struct nf_defrag_hook *hook;
+ int err;
+
+ /* RCU protects us from races against module unloading */
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ if (!hook) {
+ rcu_read_unlock();
+ err = request_module("%s", mod);
+ if (err)
+ return ERR_PTR(err < 0 ? err : -EINVAL);
+
+ rcu_read_lock();
+ hook = rcu_dereference(*ptr_global_hook);
+ }
+
+ if (hook && try_module_get(hook->owner)) {
+ /* Once we have a refcnt on the module, we no longer need RCU */
+ hook = rcu_pointer_handoff(hook);
+ } else {
+ WARN_ONCE(!hook, "%s has bad registration", mod);
+ hook = ERR_PTR(-ENOENT);
+ }
+ rcu_read_unlock();
+
+ if (!IS_ERR(hook)) {
+ err = hook->enable(link->net);
+ if (err) {
+ module_put(hook->owner);
+ hook = ERR_PTR(err);
+ }
+ }
+
+ return hook;
+}
+#endif
+
+static int bpf_nf_enable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook __maybe_unused *hook;
+
+ switch (link->hook_ops.pf) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+ case NFPROTO_IPV4:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v4_hook, "nf_defrag_ipv4");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ case NFPROTO_IPV6:
+ hook = get_proto_defrag_hook(link, &nf_defrag_v6_hook, "nf_defrag_ipv6");
+ if (IS_ERR(hook))
+ return PTR_ERR(hook);
+
+ link->defrag_hook = hook;
+ return 0;
+#endif
+ default:
+ return -EAFNOSUPPORT;
+ }
+}
+
+static void bpf_nf_disable_defrag(struct bpf_nf_link *link)
+{
+ const struct nf_defrag_hook *hook = link->defrag_hook;
+
+ if (!hook)
+ return;
+ hook->disable(link->net);
+ module_put(hook->owner);
+}
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ if (nf_link->dead)
+ return;
+
+ /* do not double release in case .detach was already called */
+ if (!cmpxchg(&nf_link->dead, 0, 1)) {
+ nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+ bpf_nf_disable_defrag(nf_link);
+ put_net_track(nf_link->net, &nf_link->ns_tracker);
+ }
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+ bpf_nf_link_release(link);
+ return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+ seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+ nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+ nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+ const struct nf_defrag_hook *hook = nf_link->defrag_hook;
+
+ info->netfilter.pf = nf_link->hook_ops.pf;
+ info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+ info->netfilter.priority = nf_link->hook_ops.priority;
+ info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0;
+
+ return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+ .release = bpf_nf_link_release,
+ .dealloc = bpf_nf_link_dealloc,
+ .detach = bpf_nf_link_detach,
+ .show_fdinfo = bpf_nf_link_show_info,
+ .fill_link_info = bpf_nf_link_fill_link_info,
+ .update_prog = bpf_nf_link_update,
+};
+
+static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
+{
+ int prio;
+
+ switch (attr->link_create.netfilter.pf) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS)
+ return -EPROTO;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (attr->link_create.netfilter.flags & ~BPF_F_NETFILTER_IP_DEFRAG)
+ return -EOPNOTSUPP;
+
+ /* make sure conntrack confirm is always last */
+ prio = attr->link_create.netfilter.priority;
+ if (prio == NF_IP_PRI_FIRST)
+ return -ERANGE; /* sabotage_in and other warts */
+ else if (prio == NF_IP_PRI_LAST)
+ return -ERANGE; /* e.g. conntrack confirm */
+ else if ((attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) &&
+ prio <= NF_IP_PRI_CONNTRACK_DEFRAG)
+ return -ERANGE; /* cannot use defrag if prog runs before nf_defrag */
+
+ return 0;
+}
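Seen from user space, these checks constrain the link_create attributes: the priority must be strictly between NF_IP_PRI_FIRST and NF_IP_PRI_LAST, and above NF_IP_PRI_CONNTRACK_DEFRAG (-400) when BPF_F_NETFILTER_IP_DEFRAG is requested. A hedged libbpf sketch; bpf_program__attach_netfilter() and bpf_netfilter_opts exist in recent libbpf, but the exact option names should be checked against the libbpf version in use:

#include <bpf/libbpf.h>
#include <linux/netfilter.h>

/* prog: a loaded BPF_PROG_TYPE_NETFILTER program; error handling elided. */
static struct bpf_link *attach_nf(struct bpf_program *prog)
{
	LIBBPF_OPTS(bpf_netfilter_opts, opts,
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_LOCAL_IN,
		.priority = -128,	/* not FIRST/LAST, > CONNTRACK_DEFRAG */
		.flags    = 0,		/* or BPF_F_NETFILTER_IP_DEFRAG */
	);

	return bpf_program__attach_netfilter(prog, &opts);
}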
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct bpf_nf_link *link;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ err = bpf_nf_check_pf_and_hooks(attr);
+ if (err)
+ return err;
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog,
+ attr->link_create.attach_type);
+
+ link->hook_ops.hook = nf_hook_run_bpf;
+ link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+ link->hook_ops.priv = prog;
+
+ link->hook_ops.pf = attr->link_create.netfilter.pf;
+ link->hook_ops.priority = attr->link_create.netfilter.priority;
+ link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+ link->net = net;
+ link->dead = false;
+ link->defrag_hook = NULL;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ if (attr->link_create.netfilter.flags & BPF_F_NETFILTER_IP_DEFRAG) {
+ err = bpf_nf_enable_defrag(link);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
+ err = nf_register_net_hook(net, &link->hook_ops);
+ if (err) {
+ bpf_nf_disable_defrag(link);
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ get_net_track(net, &link->ns_tracker, GFP_KERNEL);
+
+ return bpf_link_settle(&link_primer);
+}
+
+const struct bpf_prog_ops netfilter_prog_ops = {
+ .test_run = bpf_prog_test_run_nf,
+};
+
+static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
+{
+ struct btf *btf;
+ s32 type_id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR_OR_NULL(btf))
+ return false;
+
+ type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+ if (WARN_ON_ONCE(type_id < 0))
+ return false;
+
+ info->btf = btf;
+ info->btf_id = type_id;
+ info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ return true;
+}
+
+static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_nf_ctx))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_nf_ctx, skb):
+ if (size != sizeof_field(struct bpf_nf_ctx, skb))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "sk_buff");
+ case bpf_ctx_range(struct bpf_nf_ctx, state):
+ if (size != sizeof_field(struct bpf_nf_ctx, state))
+ return false;
+
+ return nf_ptr_to_btf_id(info, "nf_hook_state");
+ default:
+ return false;
+ }
+}
+
+static const struct bpf_func_proto *
+bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+const struct bpf_verifier_ops netfilter_verifier_ops = {
+ .is_valid_access = nf_is_valid_access,
+ .get_func_proto = bpf_nf_func_proto,
+};
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 7554c56b2e63..f1be4dd5cf85 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* count the number of connections matching an arbitrary key.
*
@@ -121,15 +122,68 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
return ERR_PTR(-EAGAIN);
}
+static bool get_ct_or_tuple_from_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conn **ct,
+ struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone **zone,
+ bool *refcounted)
+{
+ const struct nf_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *found_ct;
+
+ found_ct = nf_ct_get(skb, &ctinfo);
+ if (found_ct && !nf_ct_is_template(found_ct)) {
+ *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ *zone = nf_ct_zone(found_ct);
+ *ct = found_ct;
+ return true;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple))
+ return false;
+
+ if (found_ct)
+ *zone = nf_ct_zone(found_ct);
+
+ h = nf_conntrack_find_get(net, *zone, tuple);
+ if (!h)
+ return true;
+
+ found_ct = nf_ct_tuplehash_to_ctrack(h);
+ *refcounted = true;
+ *ct = found_ct;
+
+ return true;
+}
+
static int __nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conn *ct = NULL;
struct nf_conn *found_ct;
unsigned int collect = 0;
+ bool refcounted = false;
+
+ if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted))
+ return -ENOENT;
+
+ if (ct && nf_ct_is_confirmed(ct)) {
+ if (refcounted)
+ nf_ct_put(ct);
+ return -EEXIST;
+ }
+
+ if ((u32)jiffies == list->last_gc)
+ goto add_new_node;
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
@@ -140,10 +194,10 @@ static int __nf_conncount_add(struct net *net,
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
- return 0; /* already exists */
+ goto out_put; /* already exists */
} else {
collect++;
}
@@ -152,7 +206,7 @@ static int __nf_conncount_add(struct net *net,
found_ct = nf_ct_tuplehash_to_ctrack(found);
- if (nf_ct_tuple_equal(&conn->tuple, tuple) &&
+ if (nf_ct_tuple_equal(&conn->tuple, &tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
* We should not see tuples twice unless someone hooks
@@ -161,7 +215,7 @@ static int __nf_conncount_add(struct net *net,
* Attempt to avoid a re-add in this case.
*/
nf_ct_put(found_ct);
- return 0;
+ goto out_put;
} else if (already_closed(found_ct)) {
/*
* we do not care about connections which are
@@ -176,6 +230,7 @@ static int __nf_conncount_add(struct net *net,
nf_ct_put(found_ct);
}
+add_new_node:
if (WARN_ON_ONCE(list->count > INT_MAX))
return -EOVERFLOW;
@@ -183,42 +238,48 @@ static int __nf_conncount_add(struct net *net,
if (conn == NULL)
return -ENOMEM;
- conn->tuple = *tuple;
+ conn->tuple = tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
list_add_tail(&conn->node, &list->head);
list->count++;
+ list->last_gc = (u32)jiffies;
+
+out_put:
+ if (refcounted)
+ nf_ct_put(ct);
return 0;
}
-int nf_conncount_add(struct net *net,
- struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+int nf_conncount_add_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_list *list)
{
int ret;
/* check the saved connections */
spin_lock_bh(&list->list_lock);
- ret = __nf_conncount_add(net, list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, list);
spin_unlock_bh(&list->list_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_conncount_add);
+EXPORT_SYMBOL_GPL(nf_conncount_add_skb);
void nf_conncount_list_init(struct nf_conncount_list *list)
{
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
+ list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
/* Return true if the list is empty. Must be called with BH disabled. */
-bool nf_conncount_gc_list(struct net *net,
- struct nf_conncount_list *list)
+static bool __nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
{
const struct nf_conntrack_tuple_hash *found;
struct nf_conncount_tuple *conn, *conn_n;
@@ -226,8 +287,8 @@ bool nf_conncount_gc_list(struct net *net,
unsigned int collected = 0;
bool ret = false;
- /* don't bother if other cpu is already doing GC */
- if (!spin_trylock(&list->list_lock))
+ /* don't bother if we just did GC */
+ if ((u32)jiffies == READ_ONCE(list->last_gc))
return false;
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
@@ -257,7 +318,22 @@ bool nf_conncount_gc_list(struct net *net,
if (!list->count)
ret = true;
- spin_unlock(&list->list_lock);
+ list->last_gc = (u32)jiffies;
+
+ return ret;
+}
+
+bool nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
+{
+ bool ret;
+
+ /* don't bother if other cpu is already doing GC */
+ if (!spin_trylock_bh(&list->list_lock))
+ return false;
+
+ ret = __nf_conncount_gc_list(net, list);
+ spin_unlock_bh(&list->list_lock);
return ret;
}
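The conversion here layers two throttles: a jiffies stamp makes a second GC pass within the same tick a no-op, and the trylock lets concurrent callers skip collection instead of queueing behind it. A generic user-space sketch of the same pattern, with a coarse seconds clock standing in for jiffies:

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

struct throttled_list {
	pthread_mutex_t lock;
	time_t last_gc;		/* stands in for the u32 jiffies stamp */
};

static void collect(struct throttled_list *l)
{
	(void)l;		/* scan the list, evict dead entries */
}

static bool gc_list(struct throttled_list *l)
{
	/* don't bother if another thread is already doing GC */
	if (pthread_mutex_trylock(&l->lock))
		return false;

	/* don't bother if we just did GC in this tick */
	if (l->last_gc != time(NULL)) {
		collect(l);
		l->last_gc = time(NULL);
	}

	pthread_mutex_unlock(&l->lock);
	return true;
}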
@@ -297,20 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
static unsigned int
insert_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
struct rb_root *root,
unsigned int hash,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+ const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
+ bool do_gc = true, refcounted = false;
+ unsigned int count = 0, gc_count = 0;
struct rb_node **rbnode, *parent;
- struct nf_conncount_rb *rbconn;
+ struct nf_conntrack_tuple tuple;
struct nf_conncount_tuple *conn;
- unsigned int count = 0, gc_count = 0;
- u8 keylen = data->keylen;
- bool do_gc = true;
+ struct nf_conncount_rb *rbconn;
+ struct nf_conn *ct = NULL;
spin_lock_bh(&nf_conncount_locks[hash]);
restart:
@@ -321,7 +399,7 @@ restart:
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
parent = *rbnode;
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
rbnode = &((*rbnode)->rb_left);
} else if (diff > 0) {
@@ -329,8 +407,8 @@ restart:
} else {
int ret;
- ret = nf_conncount_add(net, &rbconn->list, tuple, zone);
- if (ret)
+ ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list);
+ if (ret && ret != -EEXIST)
count = 0; /* hotdrop */
else
count = rbconn->list.count;
@@ -353,28 +431,35 @@ restart:
goto restart;
}
- /* expected case: match, insert new node */
- rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
- if (rbconn == NULL)
- goto out_unlock;
+ if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) {
+ /* expected case: match, insert new node */
+ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+ if (rbconn == NULL)
+ goto out_unlock;
- conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
- if (conn == NULL) {
- kmem_cache_free(conncount_rb_cachep, rbconn);
- goto out_unlock;
- }
+ conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+ if (conn == NULL) {
+ kmem_cache_free(conncount_rb_cachep, rbconn);
+ goto out_unlock;
+ }
- conn->tuple = *tuple;
- conn->zone = *zone;
- memcpy(rbconn->key, key, sizeof(u32) * keylen);
+ conn->tuple = tuple;
+ conn->zone = *zone;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
+ memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
- nf_conncount_list_init(&rbconn->list);
- list_add(&conn->node, &rbconn->list.head);
- count = 1;
- rbconn->list.count = count;
+ nf_conncount_list_init(&rbconn->list);
+ list_add(&conn->node, &rbconn->list.head);
+ count = 1;
+ rbconn->list.count = count;
- rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_link_node_rcu(&rbconn->node, parent, rbnode);
+ rb_insert_color(&rbconn->node, root);
+
+ if (refcounted)
+ nf_ct_put(ct);
+ }
out_unlock:
spin_unlock_bh(&nf_conncount_locks[hash]);
return count;
@@ -382,16 +467,15 @@ out_unlock:
static unsigned int
count_tree(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+ const u32 *key)
{
struct rb_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
- u8 keylen = data->keylen;
hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
root = &data->root[hash];
@@ -402,7 +486,7 @@ count_tree(struct net *net,
rbconn = rb_entry(parent, struct nf_conncount_rb, node);
- diff = key_diff(key, rbconn->key, keylen);
+ diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
parent = rcu_dereference_raw(parent->rb_left);
} else if (diff > 0) {
@@ -410,7 +494,7 @@ count_tree(struct net *net,
} else {
int ret;
- if (!tuple) {
+ if (!skb) {
nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
}
@@ -425,19 +509,23 @@ count_tree(struct net *net,
}
/* same source network -> be counted! */
- ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
+ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
spin_unlock_bh(&rbconn->list.list_lock);
- if (ret)
+ if (ret && ret != -EEXIST) {
return 0; /* hotdrop */
- else
+ } else {
+ /* -EEXIST means add was skipped, update the list */
+ if (ret == -EEXIST)
+ nf_conncount_gc_list(net, &rbconn->list);
return rbconn->list.count;
+ }
}
}
- if (!tuple)
+ if (!skb)
return 0;
- return insert_tree(net, data, root, hash, key, tuple, zone);
+ return insert_tree(net, skb, l3num, data, root, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
@@ -499,24 +587,24 @@ next:
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- * Call with RCU read lock.
+ * If 'skb' is not null, insert the corresponding tuple into the accounting
+ * data structure. Call with RCU read lock.
*/
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+unsigned int nf_conncount_count_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_data *data,
+ const u32 *key)
{
- return count_tree(net, data, key, tuple, zone);
+ return count_tree(net, skb, l3num, data, key);
}
-EXPORT_SYMBOL_GPL(nf_conncount_count);
+EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
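Callers no longer extract a tuple and zone themselves; they pass the skb and l3num and let conncount derive both, with -EEXIST signalling an already-confirmed conntrack entry that must not be counted twice. A hedged sketch of a connlimit-style caller (kernel context assumed; everything except nf_conncount_count_skb() is illustrative):

/* Hypothetical caller; returns true when the limit is exceeded or on
 * lookup failure (a count of 0 doubles as the hotdrop signal).
 */
static bool over_limit(struct net *net, const struct sk_buff *skb,
		       u16 l3num, struct nf_conncount_data *data,
		       const u32 *key, unsigned int limit)
{
	unsigned int connections;

	connections = nf_conncount_count_skb(net, skb, l3num, data, key);
	if (connections == 0)
		return true;

	return connections > limit;
}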
-struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
- unsigned int keylen)
+struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
- int ret, i;
+ int i;
if (keylen % sizeof(u32) ||
keylen / sizeof(u32) > MAX_KEYLEN ||
@@ -529,12 +617,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
if (!data)
return ERR_PTR(-ENOMEM);
- ret = nf_ct_netns_get(net, family);
- if (ret < 0) {
- kfree(data);
- return ERR_PTR(ret);
- }
-
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
data->root[i] = RB_ROOT;
@@ -571,13 +653,11 @@ static void destroy_tree(struct rb_root *r)
}
}
-void nf_conncount_destroy(struct net *net, unsigned int family,
- struct nf_conncount_data *data)
+void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
unsigned int i;
cancel_work_sync(&data->gc_work);
- nf_ct_netns_put(net, family);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
@@ -593,15 +673,11 @@ static int __init nf_conncount_modinit(void)
for (i = 0; i < CONNCOUNT_SLOTS; ++i)
spin_lock_init(&nf_conncount_locks[i]);
- conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",
- sizeof(struct nf_conncount_tuple),
- 0, 0, NULL);
+ conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
- conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",
- sizeof(struct nf_conncount_rb),
- 0, 0, NULL);
+ conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0);
if (!conncount_rb_cachep) {
kmem_cache_destroy(conncount_conn_cachep);
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 49e523cc49d0..385a5f458aba 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -1,11 +1,8 @@
-/* Accouting handling for netfilter. */
+// SPDX-License-Identifier: GPL-2.0-only
+/* Accounting handling for netfilter. */
/*
* (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -25,26 +22,7 @@ static bool nf_ct_acct __read_mostly;
module_param_named(acct, nf_ct_acct, bool, 0644);
MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
-static const struct nf_ct_ext_type acct_extend = {
- .len = sizeof(struct nf_conn_acct),
- .align = __alignof__(struct nf_conn_acct),
- .id = NF_CT_EXT_ACCT,
-};
-
void nf_conntrack_acct_pernet_init(struct net *net)
{
net->ct.sysctl_acct = nf_ct_acct;
}
-
-int nf_conntrack_acct_init(void)
-{
- int ret = nf_ct_extend_register(&acct_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_acct_fini(void)
-{
- nf_ct_extend_unregister(&acct_extend);
-}
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index 20edd589fe06..7be4c35e4795 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Amanda extension for IP connection tracking
*
* (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
* based on HW's ip_conntrack_irc.c as well as other modules
* (C) 2006 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/module.h>
@@ -28,11 +24,13 @@
static unsigned int master_timeout __read_mostly = 300;
static char *ts_algo = "kmp";
+#define HELPER_NAME "amanda"
+
MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
MODULE_DESCRIPTION("Amanda connection tracking module");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_amanda");
-MODULE_ALIAS_NFCT_HELPER("amanda");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
module_param(master_timeout, uint, 0600);
MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
@@ -54,6 +52,7 @@ enum amanda_strings {
SEARCH_DATA,
SEARCH_MESG,
SEARCH_INDEX,
+ SEARCH_STATE,
};
static struct {
@@ -81,6 +80,10 @@ static struct {
.string = "INDEX ",
.len = 6,
},
+ [SEARCH_STATE] = {
+ .string = "STATE ",
+ .len = 6,
+ },
};
static int amanda_help(struct sk_buff *skb,
@@ -103,7 +106,7 @@ static int amanda_help(struct sk_buff *skb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- nf_ct_refresh(ct, skb, master_timeout * HZ);
+ nf_ct_refresh(ct, master_timeout * HZ);
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
@@ -124,7 +127,7 @@ static int amanda_help(struct sk_buff *skb,
goto out;
stop += start;
- for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) {
+ for (i = SEARCH_DATA; i <= SEARCH_STATE; i++) {
off = skb_find_text(skb, start, stop, search[i].ts);
if (off == UINT_MAX)
continue;
@@ -156,7 +159,7 @@ static int amanda_help(struct sk_buff *skb,
if (nf_nat_amanda && ct->status & IPS_NAT_MASK)
ret = nf_nat_amanda(skb, ctinfo, protoff,
off - dataoff, len, exp);
- else if (nf_ct_expect_related(exp) != 0) {
+ else if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
}
@@ -168,19 +171,20 @@ out:
}
static const struct nf_conntrack_expect_policy amanda_exp_policy = {
- .max_expected = 3,
+ .max_expected = 4,
.timeout = 180,
};
static struct nf_conntrack_helper amanda_helper[2] __read_mostly = {
{
- .name = "amanda",
+ .name = HELPER_NAME,
.me = THIS_MODULE,
.help = amanda_help,
.tuple.src.l3num = AF_INET,
.tuple.src.u.udp.port = cpu_to_be16(10080),
.tuple.dst.protonum = IPPROTO_UDP,
.expect_policy = &amanda_exp_policy,
+ .nat_mod_name = NF_NAT_HELPER_NAME(HELPER_NAME),
},
{
.name = "amanda",
@@ -190,6 +194,7 @@ static struct nf_conntrack_helper amanda_helper[2] __read_mostly = {
.tuple.src.u.udp.port = cpu_to_be16(10080),
.tuple.dst.protonum = IPPROTO_UDP,
.expect_policy = &amanda_exp_policy,
+ .nat_mod_name = NF_NAT_HELPER_NAME(HELPER_NAME),
},
};
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
new file mode 100644
index 000000000000..4a136fc3a9c0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Conntrack Helpers for the XDP and TC-BPF hooks
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/btf_ids.h>
+#include <linux/net_namespace.h>
+#include <net/xdp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+
+/* bpf_ct_opts - Options for CT lookup helpers
+ *
+ * Members:
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx (xdp_md, __sk_buff)
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - Passed NULL for bpf_tuple pointer
+ * -EINVAL - opts->reserved is not 0
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
+ * -EINVAL - opts->ct_zone_id set when
+ *	       opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
+ * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
+ * -ENONET - No network namespace found for netns_id
+ * -ENOENT - Conntrack lookup could not find entry for tuple
+ * -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
+ * or sizeof(tuple->ipv6)
+ * @l4proto - Layer 4 protocol
+ * Values:
+ * IPPROTO_TCP, IPPROTO_UDP
+ * @dir - connection tracking tuple direction.
+ * @ct_zone_id - connection tracking zone id.
+ * @ct_zone_dir - connection tracking zone direction.
+ * @reserved - Reserved member, will be reused for more options in the future
+ * Values:
+ * 0
+ */
+struct bpf_ct_opts {
+ s32 netns_id;
+ s32 error;
+ u8 l4proto;
+ u8 dir;
+ u16 ct_zone_id;
+ u8 ct_zone_dir;
+ u8 reserved[3];
+};
+
+enum {
+ NF_BPF_CT_OPTS_SZ = 16,
+};
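A hedged sketch of how an XDP program would fill bpf_ct_opts and perform a lookup; the kfuncs are declared through extern __ksym prototypes and a BTF-generated vmlinux.h is assumed. Note that an opts__sz of 16 selects the new layout, so the reserved bytes must stay zero:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

extern struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) __ksym;
extern void bpf_ct_release(struct nf_conn *nfct) __ksym;

SEC("xdp")
int ct_lookup_example(struct xdp_md *ctx)
{
	struct bpf_ct_opts opts = {
		.netns_id = -1,			/* BPF_F_CURRENT_NETNS */
		.l4proto  = IPPROTO_TCP,
	};
	struct bpf_sock_tuple tup = {
		.ipv4 = {
			.saddr = bpf_htonl(0x0a000001),	/* 10.0.0.1, example */
			.daddr = bpf_htonl(0x0a000002),	/* 10.0.0.2, example */
			.sport = bpf_htons(12345),
			.dport = bpf_htons(80),
		},
	};
	struct nf_conn *ct;

	ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (ct)
		bpf_ct_release(ct);	/* acquired references must be released */

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";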
+
+static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, u8 protonum, u8 dir,
+ struct nf_conntrack_tuple *tuple)
+{
+ union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
+ union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
+ union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
+ : &tuple->src.u;
+ union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
+ : (void *)&tuple->dst.u;
+
+ if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
+ return -EPROTO;
+
+ memset(tuple, 0, sizeof(*tuple));
+
+ switch (tuple_len) {
+ case sizeof(bpf_tuple->ipv4):
+ tuple->src.l3num = AF_INET;
+ src->ip = bpf_tuple->ipv4.saddr;
+ sport->tcp.port = bpf_tuple->ipv4.sport;
+ dst->ip = bpf_tuple->ipv4.daddr;
+ dport->tcp.port = bpf_tuple->ipv4.dport;
+ break;
+ case sizeof(bpf_tuple->ipv6):
+ tuple->src.l3num = AF_INET6;
+ memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
+ sport->tcp.port = bpf_tuple->ipv6.sport;
+ memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
+ dport->tcp.port = bpf_tuple->ipv6.dport;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+
+ return 0;
+}
+
+static struct nf_conn *
+__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
+ u32 timeout)
+{
+ struct nf_conntrack_tuple otuple, rtuple;
+ struct nf_conntrack_zone ct_zone;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple)
+ return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &otuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_REPLY, &rtuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple,
+ GFP_ATOMIC);
+ if (IS_ERR(ct))
+ goto out;
+
+ memset(&ct->proto, 0, sizeof(ct->proto));
+ __nf_ct_set_timeout(ct, timeout * HZ);
+
+out:
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return ct;
+}
+
+static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
+ struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple_len, struct bpf_ct_opts *opts,
+ u32 opts_len)
+{
+ struct nf_conntrack_tuple_hash *hash;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_zone ct_zone;
+ struct nf_conn *ct;
+ int err;
+
+ if (!opts || !bpf_tuple)
+ return ERR_PTR(-EINVAL);
+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
+ return ERR_PTR(-EINVAL);
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (opts->ct_zone_id)
+ return ERR_PTR(-EINVAL);
+ }
+ if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
+ return ERR_PTR(-EPROTO);
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
+ IP_CT_DIR_ORIGINAL, &tuple);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net))
+ return ERR_PTR(-ENONET);
+ }
+
+ if (opts_len == NF_BPF_CT_OPTS_SZ) {
+ if (opts->ct_zone_dir == 0)
+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
+ nf_ct_zone_init(&ct_zone,
+ opts->ct_zone_id, opts->ct_zone_dir, 0);
+ } else {
+ ct_zone = nf_ct_zone_dflt;
+ }
+
+ hash = nf_conntrack_find_get(net, &ct_zone, &tuple);
+ if (opts->netns_id >= 0)
+ put_net(net);
+ if (!hash)
+ return ERR_PTR(-ENOENT);
+
+ ct = nf_ct_tuplehash_to_ctrack(hash);
+ opts->dir = NF_CT_DIRECTION(hash);
+
+ return ct;
+}
+
+BTF_ID_LIST(btf_nf_conn_ids)
+BTF_ID(struct, nf_conn)
+BTF_ID(struct, nf_conn___init)
+
+/* Check writes into `struct nf_conn` */
+static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *ncit, *nct, *t;
+ size_t end;
+
+ ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
+ nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != nct && t != ncit) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ /* `struct nf_conn` and `struct nf_conn___init` have the same layout
+ * so we are safe to simply merge offset checks here
+ */
+ switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+ case offsetof(struct nf_conn, mark):
+ end = offsetofend(struct nf_conn, mark);
+ break;
+#endif
+ default:
+ bpf_log(log, "no write support to nf_conn at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
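The net effect of this check: a program holding a trusted nf_conn pointer may store to exactly one field, mark (and only with CONFIG_NF_CONNTRACK_MARK); every other offset fails verification. A hedged TC-side sketch, with kfunc externs as in the XDP example elsewhere in this series and tuple setup elided:

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) __ksym;
extern void bpf_ct_release(struct nf_conn *nfct) __ksym;

SEC("tc")
int set_ct_mark(struct __sk_buff *skb)
{
	struct bpf_ct_opts opts = { .netns_id = -1, .l4proto = IPPROTO_TCP };
	struct bpf_sock_tuple tup = {};	/* fill from the packet in real code */
	struct nf_conn *ct;

	ct = bpf_skb_ct_lookup(skb, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (ct) {
		ct->mark = 0xcafe;	/* the one store this hook permits */
		bpf_ct_release(ct);
	}

	return 0;	/* TC_ACT_OK */
}

char _license[] SEC("license") = "GPL";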
+
+__bpf_kfunc_start_defs();
+
+/* bpf_xdp_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn___init *
+bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct nf_conn *nfct;
+
+ nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
+ opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @xdp_ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn *
+bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = dev_net(ctx->rxq->dev);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_skb_ct_alloc - Allocate a new CT entry
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for allocation (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn___init *
+bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct nf_conn *nfct;
+ struct net *net;
+
+ net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+
+ return (struct nf_conn___init *)nfct;
+}
+
+/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
+ * reference to it
+ *
+ * Parameters:
+ * @skb_ctx - Pointer to ctx (__sk_buff) in TC program
+ * Cannot be NULL
+ * @bpf_tuple - Pointer to memory representing the tuple to look up
+ * Cannot be NULL
+ * @tuple__sz - Length of the tuple structure
+ * Must be one of sizeof(bpf_tuple->ipv4) or
+ * sizeof(bpf_tuple->ipv6)
+ * @opts - Additional options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_ct_opts structure
+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12
+ */
+__bpf_kfunc struct nf_conn *
+bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
+ u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct net *caller_net;
+ struct nf_conn *nfct;
+
+ caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+ nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
+ if (IS_ERR(nfct)) {
+ if (opts)
+ opts->error = PTR_ERR(nfct);
+ return NULL;
+ }
+ return nfct;
+}
+
+/* bpf_ct_insert_entry - Add the provided entry into a CT map
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID.
+ *
+ * @nfct - Pointer to referenced nf_conn___init object, obtained
+ * using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ */
+__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+{
+ struct nf_conn *nfct = (struct nf_conn *)nfct_i;
+ int err;
+
+ if (!nf_ct_is_confirmed(nfct))
+ nfct->timeout += nfct_time_stamp;
+ nfct->status |= IPS_CONFIRMED;
+ err = nf_conntrack_hash_check_insert(nfct);
+ if (err < 0) {
+ nf_conntrack_free(nfct);
+ return NULL;
+ }
+ return nfct;
+}
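Together with bpf_xdp_ct_alloc()/bpf_skb_ct_alloc(), the intended lifecycle is: allocate an unconfirmed nf_conn___init, configure it, then hand it to bpf_ct_insert_entry(), which consumes the reference on both paths and returns either a confirmed nf_conn or NULL after freeing it. A hedged sketch of the program side (vmlinux.h and zeroed tuple setup as in the earlier examples):

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) __ksym;
extern struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) __ksym;
extern void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) __ksym;
extern void bpf_ct_release(struct nf_conn *nfct) __ksym;

SEC("xdp")
int ct_create_example(struct xdp_md *ctx)
{
	struct bpf_ct_opts opts = { .netns_id = -1, .l4proto = IPPROTO_UDP };
	struct bpf_sock_tuple tup = {};	/* fill from the packet in real code */
	struct nf_conn___init *ct;
	struct nf_conn *ins;

	ct = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (!ct)
		return XDP_PASS;

	bpf_ct_set_timeout(ct, 30000);	/* msecs; only before insertion */

	ins = bpf_ct_insert_entry(ct);	/* consumes ct on success and failure */
	if (ins)
		bpf_ct_release(ins);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";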
+
+/* bpf_ct_release - Release acquired nf_conn object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID; the verifier rejects
+ * the program if any references remain at exit in any of the explored
+ * states.
+ *
+ * Parameters:
+ * @nf_conn - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ */
+__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
+{
+ nf_ct_put(nfct);
+}
+
+/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
+ *
+ * Sets the default timeout of newly allocated nf_conn before insertion.
+ * This helper must be invoked for refcounted pointer to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @timeout - Timeout in msecs.
+ */
+__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+{
+ __nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
+ *
+ * Change the timeout associated with the inserted or looked-up nf_conn.
+ * This helper must be invoked for refcounted pointer to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
+ * @timeout - New timeout in msecs.
+ */
+__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+{
+ return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
+}
+
+/* bpf_ct_set_status - Set status field of allocated nf_conn
+ *
+ * Set the status field of the newly allocated nf_conn before insertion.
+ * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @status - New status value.
+ */
+__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+{
+ return nf_ct_change_status_common((struct nf_conn *)nfct, status);
+}
+
+/* bpf_ct_change_status - Change status of inserted nf_conn
+ *
+ * Change the status field of the provided connection tracking entry.
+ * This must be invoked for a referenced PTR_TO_BTF_ID to nf_conn.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
+ * @status - New status value.
+ */
+__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+{
+ return nf_ct_change_status_common(nfct, status);
+}
+
+__bpf_kfunc_end_defs();
+
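Taken together, the allocation-side kfuncs are meant to be called in a fixed order: alloc, then set_timeout/set_status on the nf_conn___init, then insert_entry, then release. A hedged XDP-side sketch of that sequence (again not part of the patch; the tuple contents and the 30s timeout are arbitrary assumptions):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

extern struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz) __ksym;
extern struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) __ksym;
extern void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout) __ksym;
extern int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status) __ksym;
extern void bpf_ct_release(struct nf_conn *nfct) __ksym;

SEC("xdp")
int ct_create_sketch(struct xdp_md *ctx)
{
	struct bpf_ct_opts opts = { .netns_id = -1, .l4proto = IPPROTO_UDP };
	struct bpf_sock_tuple tup = {};	/* fill from the parsed packet */
	struct nf_conn___init *ct_i;
	struct nf_conn *ct;

	ct_i = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
	if (!ct_i)
		return XDP_PASS;

	/* both helpers operate on the not-yet-inserted nf_conn___init */
	bpf_ct_set_timeout(ct_i, 30000);	/* 30s, in msecs */
	bpf_ct_set_status(ct_i, IPS_SEEN_REPLY);

	/* consumes the ct_i reference and returns a new nf_conn reference */
	ct = bpf_ct_insert_entry(ct_i);
	if (ct)
		bpf_ct_release(ct);

	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";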
+BTF_KFUNCS_START(nf_ct_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(nf_ct_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ct_kfunc_set,
+};
+
+int register_nf_conntrack_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+ if (!ret) {
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
+ mutex_unlock(&nf_conn_btf_access_lock);
+ }
+
+ return ret;
+}
+
+void cleanup_nf_conntrack_bpf(void)
+{
+ mutex_lock(&nf_conn_btf_access_lock);
+ nfct_btf_struct_access = NULL;
+ mutex_unlock(&nf_conn_btf_access_lock);
+}
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 5423b197d98a..a7552a46d6ac 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* broadcast connection tracking helper
*
* (c) 2005 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -24,6 +20,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
unsigned int timeout)
{
+ const struct nf_conntrack_helper *helper;
struct nf_conntrack_expect *exp;
struct iphdr *iph = ip_hdr(skb);
struct rtable *rt = skb_rtable(skb);
@@ -41,12 +38,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
in_dev = __in_dev_get_rcu(rt->dst.dev);
if (in_dev != NULL) {
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
if (ifa->ifa_broadcast == iph->daddr) {
mask = ifa->ifa_mask;
break;
}
- } endfor_ifa(in_dev);
+ }
}
if (mask == 0)
@@ -57,7 +59,10 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
goto out;
exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+ helper = rcu_dereference(help->helper);
+ if (helper)
+ exp->tuple.src.u.udp.port = helper->tuple.src.u.udp.port;
exp->mask.src.u3.ip = mask;
exp->mask.src.u.udp.port = htons(0xFFFF);
@@ -67,13 +72,14 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
exp->helper = NULL;
- nf_ct_expect_related(exp);
+ nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
- nf_ct_refresh(ct, skb, timeout * HZ);
+ nf_ct_refresh(ct, timeout * HZ);
out:
return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Broadcast connection tracking helper");
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 741b533148ba..0b95f226f211 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter. This is separated from,
but required by, the NAT layer; it can also be used by an iptables
extension. */
@@ -6,10 +7,6 @@
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
* (C) 2005-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -24,7 +21,7 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
@@ -37,10 +34,10 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -51,7 +48,6 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>
@@ -69,23 +65,39 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
struct conntrack_gc_work {
struct delayed_work dwork;
- u32 last_bucket;
+ u32 next_bucket;
+ u32 avg_timeout;
+ u32 count;
+ u32 start_time;
bool exiting;
bool early_drop;
- long next_gc_run;
};
static __read_mostly struct kmem_cache *nf_conntrack_cachep;
-static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
+static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;
-/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
-#define GC_MAX_BUCKETS_DIV 128u
-/* upper bound of full table scan */
-#define GC_MAX_SCAN_JIFFIES (16u * HZ)
-/* desired ratio of entries found to be expired */
-#define GC_EVICT_RATIO 50u
+/* serialize hash resizes and nf_ct_iterate_cleanup */
+static DEFINE_MUTEX(nf_conntrack_mutex);
+
+#define GC_SCAN_INTERVAL_MAX (60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN (1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
+
+/* Initial bias pretending we have 100 entries at the upper bound so we don't
+ * wake up often just because we have three entries with a 1s timeout while
+ * still allowing non-idle machines to wake up more often when needed.
+ */
+#define GC_SCAN_INITIAL_COUNT 100
+#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX
+
+#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
+
+#define MIN_CHAINLEN 50u
+#define MAX_CHAINLEN (80u - MIN_CHAINLEN)
static struct conntrack_gc_work conntrack_gc_work;
@@ -124,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
}
/* return true if we need to recompute hashes (in case hash table was resized) */
-static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
- unsigned int h2, unsigned int sequence)
+static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2,
+ unsigned int sequence)
{
h1 %= CONNTRACK_LOCKS;
h2 %= CONNTRACK_LOCKS;
@@ -147,12 +159,21 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
}
static void nf_conntrack_all_lock(void)
+ __acquires(&nf_conntrack_locks_all_lock)
{
int i;
spin_lock(&nf_conntrack_locks_all_lock);
- nf_conntrack_locks_all = true;
+ /* For nf_conntrack_locks_all, only the latest time when another
+ * CPU will see an update is controlled by the "release" of the
+ * spin_lock below.
+ * The earliest time is not controlled, and thus KCSAN could detect
+ * a race when nf_conntrack_lock() reads the variable.
+ * WRITE_ONCE() is used to ensure the compiler will not
+ * optimize the write.
+ */
+ WRITE_ONCE(nf_conntrack_locks_all, true);
for (i = 0; i < CONNTRACK_LOCKS; i++) {
spin_lock(&nf_conntrack_locks[i]);
@@ -166,6 +187,7 @@ static void nf_conntrack_all_lock(void)
}
static void nf_conntrack_all_unlock(void)
+ __releases(&nf_conntrack_locks_all_lock)
{
/* All prior stores must be complete before we clear
* 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
@@ -182,26 +204,25 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
-seqcount_t nf_conntrack_generation __read_mostly;
-static unsigned int nf_conntrack_hash_rnd __read_mostly;
+seqcount_spinlock_t nf_conntrack_generation __read_mostly;
+static siphash_aligned_key_t nf_conntrack_hash_rnd;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
const struct net *net)
{
- unsigned int n;
- u32 seed;
+ siphash_key_t key;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- /* The direction must be ignored, so we hash everything up to the
- * destination ports (which is a multiple of 4) and treat the last
- * three bytes manually.
- */
- seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
- n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
- return jhash2((u32 *)tuple, n, seed ^
- (((__force __u16)tuple->dst.u.all << 16) |
- tuple->dst.protonum));
+ key = nf_conntrack_hash_rnd;
+
+ key.key[0] ^= zoneid;
+ key.key[1] ^= net_hash_mix(net);
+
+ return siphash((void *)tuple,
+ offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
+ &key);
}
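The offsetofend() bound above means only the leading portion of the tuple feeds the hash; trailing fields such as the direction never influence bucket placement. A small userspace illustration of the same idiom (not from the patch; struct tuple and fnv1a() are stand-ins for the kernel structures and siphash()):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct tuple {
	uint32_t src_ip;
	uint32_t dst_ip;
	uint16_t sport, dport;	/* hashed region ends after dport */
	uint8_t dir;		/* excluded: must not affect placement */
};

static uint64_t fnv1a(const void *p, size_t len)
{
	const uint8_t *b = p;
	uint64_t h = 0xcbf29ce484222325ULL;

	while (len--)
		h = (h ^ *b++) * 0x100000001b3ULL;
	return h;
}

int main(void)
{
	struct tuple t = { .src_ip = 1, .dst_ip = 2, .sport = 3, .dport = 4 };
	size_t hashed = offsetofend(struct tuple, dport);

	printf("hash=%016llx over %zu of %zu bytes\n",
	       (unsigned long long)fnv1a(&t, hashed), hashed, sizeof(t));
	return 0;
}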
static u32 scale_hash(u32 hash)
@@ -211,15 +232,35 @@ static u32 scale_hash(u32 hash)
static u32 __hash_conntrack(const struct net *net,
const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid,
unsigned int size)
{
- return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
+ return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}
static u32 hash_conntrack(const struct net *net,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple,
+ unsigned int zoneid)
{
- return scale_hash(hash_conntrack_raw(tuple, net));
+ return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
+}
+
+static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct nf_conntrack_tuple *tuple)
+{
+	struct {
+		__be16 sport;
+		__be16 dport;
+	} _inet_hdr, *inet_hdr;
+
+ /* Actually only need first 4 bytes to get ports. */
+ inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
+ if (!inet_hdr)
+ return false;
+
+ tuple->src.u.udp.port = inet_hdr->sport;
+ tuple->dst.u.udp.port = inet_hdr->dport;
+ return true;
}
static bool
@@ -229,16 +270,11 @@ nf_ct_get_tuple(const struct sk_buff *skb,
u_int16_t l3num,
u_int8_t protonum,
struct net *net,
- struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l4proto *l4proto)
+ struct nf_conntrack_tuple *tuple)
{
unsigned int size;
const __be32 *ap;
__be32 _addrs[8];
- struct {
- __be16 sport;
- __be16 dport;
- } _inet_hdr, *inet_hdr;
memset(tuple, 0, sizeof(*tuple));
@@ -274,16 +310,31 @@ nf_ct_get_tuple(const struct sk_buff *skb,
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- if (unlikely(l4proto->pkt_to_tuple))
- return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
-
- /* Actually only need first 4 bytes to get ports. */
- inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
- if (!inet_hdr)
- return false;
+ switch (protonum) {
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
+#endif
+ case IPPROTO_ICMP:
+ return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ return gre_pkt_to_tuple(skb, dataoff, net, tuple);
+#endif
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ case IPPROTO_UDPLITE:
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ case IPPROTO_SCTP:
+#endif
+ return nf_ct_get_tuple_ports(skb, dataoff, tuple);
+ default:
+ break;
+ }
- tuple->src.u.udp.port = inet_hdr->sport;
- tuple->dst.u.udp.port = inet_hdr->dport;
return true;
}
@@ -366,33 +417,20 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
u_int16_t l3num,
struct net *net, struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_l4proto *l4proto;
u8 protonum;
int protoff;
- int ret;
-
- rcu_read_lock();
protoff = get_l4proto(skb, nhoff, l3num, &protonum);
- if (protoff <= 0) {
- rcu_read_unlock();
+ if (protoff <= 0)
return false;
- }
-
- l4proto = __nf_ct_l4proto_find(protonum);
-
- ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
- l4proto);
- rcu_read_unlock();
- return ret;
+ return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
- const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_l4proto *l4proto)
+ const struct nf_conntrack_tuple *orig)
{
memset(inverse, 0, sizeof(*inverse));
@@ -415,8 +453,14 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
inverse->dst.protonum = orig->dst.protonum;
- if (unlikely(l4proto->invert_tuple))
- return l4proto->invert_tuple(inverse, orig);
+ switch (orig->dst.protonum) {
+ case IPPROTO_ICMP:
+ return nf_conntrack_invert_icmp_tuple(inverse, orig);
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
+#endif
+ }
inverse->src.u.all = orig->dst.u.all;
inverse->dst.u.all = orig->src.u.all;
@@ -424,10 +468,48 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
+/* Generate an almost-unique pseudo-id for a given conntrack.
+ *
+ * This intentionally doesn't re-use any of the seeds used for hash
+ * table location; we assume the id gets exposed to userspace.
+ *
+ * The following nf_conn items do not change throughout the lifetime
+ * of the nf_conn:
+ *
+ * 1. nf_conn address
+ * 2. nf_conn->master address (normally NULL)
+ * 3. the associated net namespace
+ * 4. the original direction tuple
+ */
+u32 nf_ct_get_id(const struct nf_conn *ct)
+{
+ static siphash_aligned_key_t ct_id_seed;
+ unsigned long a, b, c, d;
+
+ net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
+
+ a = (unsigned long)ct;
+ b = (unsigned long)ct->master;
+ c = (unsigned long)nf_ct_net(ct);
+ d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
+ &ct_id_seed);
+#ifdef CONFIG_64BIT
+ return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
+#else
+ return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_get_id);
+
+static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct)
+{
+ return nf_ct_get_id(nf_ct_to_nf_conn(nfct));
+}
+
static void
clean_from_lists(struct nf_conn *ct)
{
- pr_debug("clean_from_lists(%p)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
@@ -435,53 +517,9 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_dying_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) dying list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->dying);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) unconfirmed list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->unconfirmed);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* We overload first tuple to link into unconfirmed or dying list.*/
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&pcpu->lock);
-}
-
#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
-/* Released via destroy_conntrack() */
+/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
gfp_t flags)
@@ -495,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
p = tmpl;
tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
- if (tmpl != p) {
- tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
+ if (tmpl != p)
tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
- }
} else {
tmpl = kzalloc(sizeof(*tmpl), flags);
if (!tmpl)
@@ -508,7 +544,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
tmpl->status = IPS_TEMPLATE;
write_pnet(&tmpl->ct_net, net);
nf_ct_zone_add(tmpl, zone);
- atomic_set(&tmpl->ct_general.use, 0);
+ refcount_set(&tmpl->ct_general.use, 1);
return tmpl;
}
@@ -516,8 +552,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
- nf_ct_ext_destroy(tmpl);
- nf_ct_ext_free(tmpl);
+ kfree(tmpl->ext);
if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -526,24 +561,30 @@ void nf_ct_tmpl_free(struct nf_conn *tmpl)
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
-static void
-destroy_conntrack(struct nf_conntrack *nfct)
+static void destroy_gre_conntrack(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ struct nf_conn *master = ct->master;
+
+ if (master)
+ nf_ct_gre_keymap_destroy(master);
+#endif
+}
+
+void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
- const struct nf_conntrack_l4proto *l4proto;
- pr_debug("destroy_conntrack(%p)\n", ct);
- WARN_ON(atomic_read(&nfct->use) != 0);
+ WARN_ON(refcount_read(&nfct->use) != 0);
if (unlikely(nf_ct_is_template(ct))) {
nf_ct_tmpl_free(ct);
return;
}
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
- if (l4proto->destroy)
- l4proto->destroy(ct);
- local_bh_disable();
+ if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
+ destroy_gre_conntrack(ct);
+
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
@@ -551,64 +592,90 @@ destroy_conntrack(struct nf_conntrack *nfct)
*/
nf_ct_remove_expectations(ct);
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
- local_bh_enable();
-
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
nf_conntrack_free(ct);
}
+EXPORT_SYMBOL(nf_ct_destroy);
-static void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;
- nf_ct_helper_destroy(ct);
-
- local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
+}
- nf_ct_add_to_dying_list(ct);
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+
+ __nf_ct_delete_from_lists(ct);
local_bh_enable();
}
+static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
+
+ spin_lock(&cnet->ecache.dying_lock);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &cnet->ecache.dying_list);
+ spin_unlock(&cnet->ecache.dying_lock);
+#endif
+}
+
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
+ struct net *net;
if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
return false;
tstamp = nf_conn_tstamp_find(ct);
- if (tstamp && tstamp->stop == 0)
+ if (tstamp) {
+ s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;
+
tstamp->stop = ktime_get_real_ns();
+ if (timeout < 0)
+ tstamp->stop -= jiffies_to_nsecs(-timeout);
+ }
if (nf_conntrack_event_report(IPCT_DESTROY, ct,
portid, report) < 0) {
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
- nf_ct_delete_from_lists(ct);
- nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+ __nf_ct_delete_from_lists(ct);
+ nf_ct_add_to_ecache_list(ct);
+ local_bh_enable();
+
+ nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
- nf_conntrack_ecache_work(nf_ct_net(ct));
+ net = nf_ct_net(ct);
+ if (nf_conntrack_ecache_dwork_pending(net))
+ nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
return true;
@@ -647,9 +714,12 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
- if (!atomic_inc_not_zero(&ct->ct_general.use))
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
return;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct))
nf_ct_kill(ct);
@@ -683,9 +753,6 @@ begin:
continue;
}
- if (nf_ct_is_dying(ct))
- continue;
-
if (nf_ct_key_equal(h, tuple, zone, net))
return h;
}
@@ -710,22 +777,25 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- rcu_read_lock();
-begin:
h = ____nf_conntrack_find(net, zone, tuple, hash);
if (h) {
+ /* We have a candidate that matches the tuple we're interested
+ * in; try to obtain a reference and re-check the tuple.
+ */
ct = nf_ct_tuplehash_to_ctrack(h);
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
- h = NULL;
- else {
- if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
- nf_ct_put(ct);
- goto begin;
- }
+ if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
+ /* re-check key after refcount */
+ smp_acquire__after_ctrl_dep();
+
+ if (likely(nf_ct_key_equal(h, tuple, zone, net)))
+ return h;
+
+ /* TYPESAFE_BY_RCU recycled the candidate */
+ nf_ct_put(ct);
}
+
+ h = NULL;
}
- rcu_read_unlock();
return h;
}
@@ -734,8 +804,25 @@ struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
- return __nf_conntrack_find_get(net, zone, tuple,
- hash_conntrack_raw(tuple, net));
+ unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ struct nf_conntrack_tuple_hash *thash;
+
+ rcu_read_lock();
+
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, zone_id, net));
+
+ if (thash)
+ goto out_unlock;
+
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (rid != zone_id)
+ thash = __nf_conntrack_find_get(net, zone, tuple,
+ hash_conntrack_raw(tuple, rid, net));
+
+out_unlock:
+ rcu_read_unlock();
+ return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
@@ -749,6 +836,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
&nf_conntrack_hash[reply_hash]);
}
+static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
+{
+ /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
+ * may contain stale pointers to e.g. a helper that has been removed.
+ *
+ * The helper can't clear this because the nf_conn object isn't in
+ * any hash and synchronize_rcu() isn't enough because associated skb
+ * might sit in a queue.
+ */
+ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
+}
+
+static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
+{
+ if (!ext)
+ return true;
+
+ if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
+ return false;
+
+ /* inserted into conntrack table, nf_ct_iterate_cleanup()
+ * will find it. Disable nf_ct_ext_find() id check.
+ */
+ WRITE_ONCE(ext->gen_id, 0);
+ return true;
+}
+
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
@@ -757,50 +871,82 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int max_chainlen;
+ unsigned int chainlen = 0;
unsigned int sequence;
+ int err = -EEXIST;
zone = nf_ct_zone(ct);
+ if (!nf_ct_ext_valid_pre(ct->ext))
+ return -EAGAIN;
+
local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
+
+ max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
+
+ chainlen = 0;
+
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
+
+ /* If genid has changed, we can't insert anymore because ct
+ * extensions could have stale pointers and nf_ct_iterate_destroy
+ * might have completed its table scan already.
+ *
+ * Increment of the ext genid right after this check is fine:
+ * nf_ct_iterate_destroy blocks until locks are released.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ err = -EAGAIN;
+ goto out;
+ }
smp_wmb();
/* The caller holds a reference to this object */
- atomic_set(&ct->ct_general.use, 2);
+ refcount_set(&ct->ct_general.use, 2);
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
- return 0;
+ return 0;
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ err = -ENOSPC;
out:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
- return -EEXIST;
+ return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-static inline void nf_ct_acct_update(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int len)
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+ unsigned int bytes)
{
struct nf_conn_acct *acct;
@@ -808,10 +954,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct,
if (acct) {
struct nf_conn_counter *counter = acct->counter;
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ atomic64_add(packets, &counter[dir].packets);
+ atomic64_add(bytes, &counter[dir].bytes);
}
}
+EXPORT_SYMBOL_GPL(nf_ct_acct_add);
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
const struct nf_conn *loser_ct)
@@ -825,35 +972,229 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
/* u32 should be fine since we must have seen one packet. */
bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
- nf_ct_acct_update(ct, ctinfo, bytes);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
}
-/* Resolve race on insertion if this protocol allows this. */
-static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- struct nf_conntrack_tuple_hash *h)
+static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
+{
+ struct nf_conn_tstamp *tstamp;
+
+ refcount_inc(&ct->ct_general.use);
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_get_real_ns();
+}
+
+/**
+ * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
+ * @ct1: conntrack in hash table to check against
+ * @ct2: merge candidate
+ *
+ * returns true if ct1 and ct2 happen to refer to the same flow, but
+ * in opposing directions, i.e.
+ * ct1: a:b -> c:d
+ * ct2: c:d -> a:b
+ * for both directions. If so, @ct2 should not have been created
+ * as the skb should have been picked up as an ESTABLISHED flow.
+ * But ct1 was not yet committed to the hash table before the skb
+ * that created ct2 arrived.
+ *
+ * Note we don't compare netns because ct entries in different net
+ * namespace cannot clash to begin with.
+ *
+ * @return: true if ct1 and ct2 are identical when swapping origin/reply.
+ */
+static bool
+nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ u16 id1, id2;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
+ return false;
+
+ if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
+ if (id1 != id2)
+ return false;
+
+ id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
+ id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
+
+ return id1 == id2;
+}
+
+static int nf_ct_can_merge(const struct nf_conn *ct,
+ const struct nf_conn *loser_ct)
+{
+ return nf_ct_match(ct, loser_ct) ||
+ nf_ct_match_reverse(ct, loser_ct);
+}
+
+/* caller must hold locks to prevent concurrent changes */
+static int __nf_ct_resolve_clash(struct sk_buff *skb,
+ struct nf_conntrack_tuple_hash *h)
{
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- const struct nf_conntrack_l4proto *l4proto;
- enum ip_conntrack_info oldinfo;
- struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
-
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
- if (l4proto->allow_clash &&
- !nf_ct_is_dying(ct) &&
- atomic_inc_not_zero(&ct->ct_general.use)) {
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
- nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
- nf_ct_set(skb, ct, oldinfo);
- return NF_ACCEPT;
- }
- nf_ct_put(ct);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+
+ if (nf_ct_can_merge(ct, loser_ct)) {
+ struct net *net = nf_ct_net(ct);
+
+ nf_conntrack_get(&ct->ct_general);
+
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_ct_put(loser_ct);
+ nf_ct_set(skb, ct, ctinfo);
+
+ NF_CT_STAT_INC(net, clash_resolve);
+ return NF_ACCEPT;
}
+
+ return NF_DROP;
+}
+
+/**
+ * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
+ *
+ * @skb: skb that causes the collision
+ * @repl_idx: hash slot for reply direction
+ *
+ * Called when origin or reply direction had a clash.
+ * The skb can be handled without packet drop provided the reply direction
+ * is unique or the existing entry has the identical tuple in both
+ * directions.
+ *
+ * Caller must hold conntrack table locks to prevent concurrent updates.
+ *
+ * Returns NF_DROP if the clash could not be handled.
+ */
+static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
+{
+ struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct net *net;
+
+ zone = nf_ct_zone(loser_ct);
+ net = nf_ct_net(loser_ct);
+
+ /* Reply direction must never result in a clash, unless both origin
+ * and reply tuples are identical.
+ */
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
+ if (nf_ct_key_equal(h,
+ &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
+ return __nf_ct_resolve_clash(skb, h);
+ }
+
+ /* We want the clashing entry to go away real soon: 1 second timeout. */
+ WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
+
+ /* IPS_NAT_CLASH removes the entry automatically on the first
+ * reply. Also prevents UDP tracker from moving the entry to
+ * ASSURED state, i.e. the entry can always be evicted under
+ * pressure.
+ */
+ loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
+
+ __nf_conntrack_insert_prepare(loser_ct);
+
+ /* fake add for ORIGINAL dir: we want lookups to only find the entry
+ * already in the table. This also hides the clashing entry from
+ * ctnetlink iteration, i.e. conntrack -L won't show it.
+ */
+ hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+ hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+ &nf_conntrack_hash[repl_idx]);
+ /* confirmed bit must be set after hlist add, not before:
+ * loser_ct can still be visible to other cpu due to
+ * SLAB_TYPESAFE_BY_RCU.
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);
+
+ NF_CT_STAT_INC(net, clash_resolve);
+ return NF_ACCEPT;
+}
+
+/**
+ * nf_ct_resolve_clash - attempt to handle clash without packet drop
+ *
+ * @skb: skb that causes the clash
+ * @h: tuplehash of the clashing entry already in table
+ * @reply_hash: hash slot for reply direction
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple.
+ *
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
+ * to be dropped. In case @skb is retransmitted, next conntrack lookup
+ * will find the already-existing entry.
+ *
+ * The major problem with such packet drop is the extra delay added by
+ * the packet loss -- it will take some time for a retransmit to occur
+ * (or the sender to time out when waiting for a reply).
+ *
+ * This function attempts to handle the situation without packet drop.
+ *
+ * If @skb has no NAT transformation or if the colliding entries are
+ * exactly the same, only the to-be-confirmed conntrack entry is discarded
+ * and @skb is associated with the conntrack entry already in the table.
+ *
+ * Failing that, the new, unconfirmed conntrack is still added to the table
+ * provided that the collision only occurs in the ORIGINAL direction.
+ * The new entry will be added only in the non-clashing REPLY direction,
+ * so packets in the ORIGINAL direction will continue to match the existing
+ * entry. The new entry will also have a fixed timeout so it expires --
+ * due to the collision, it will only see reply traffic.
+ *
+ * Returns NF_DROP if the clash could not be resolved.
+ */
+static __cold noinline int
+nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
+ u32 reply_hash)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ const struct nf_conntrack_l4proto *l4proto;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+ struct net *net;
+ int ret;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+ net = nf_ct_net(loser_ct);
+
+ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
+ if (!l4proto->allow_clash)
+ goto drop;
+
+ ret = __nf_ct_resolve_clash(skb, h);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+ ret = nf_ct_resolve_clash_harder(skb, reply_hash);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+drop:
NF_CT_STAT_INC(net, drop);
+ NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
}
@@ -861,16 +1202,15 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
+ unsigned int chainlen = 0, sequence, max_chainlen;
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
- struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
- unsigned int sequence;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
@@ -892,69 +1232,104 @@ __nf_conntrack_confirm(struct sk_buff *skb)
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-
- } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
+ } while (nf_conntrack_double_lock(hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
* connections for unconfirmed conns. But packet copies and
* REJECT will give spurious warnings here.
*/
- /* No external references means no one else could have
- * confirmed us.
+ /* Another skb with the same unconfirmed conntrack may
+ * win the race. This can happen for bridge (br_flood) or
+ * broadcast/multicast packets, which are skb_clone()d with
+ * the unconfirmed conntrack attached.
+ */
- WARN_ON(nf_ct_is_confirmed(ct));
- pr_debug("Confirming conntrack %p\n", ct);
+ if (unlikely(nf_ct_is_confirmed(ct))) {
+ WARN_ON_ONCE(1);
+ nf_conntrack_double_unlock(hash, reply_hash);
+ local_bh_enable();
+ return NF_DROP;
+ }
+
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ goto dying;
+ }
+
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
if (unlikely(nf_ct_is_dying(ct))) {
- nf_ct_add_to_dying_list(ct);
+ NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
+ max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen)
+ goto chaintoolong;
+ }
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
+ chainlen = 0;
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
+ if (chainlen++ > max_chainlen) {
+chaintoolong:
+ NF_CT_STAT_INC(net, chaintoolong);
+ NF_CT_STAT_INC(net, insert_failed);
+ ret = NF_DROP;
+ goto dying;
+ }
+ }
- /* Timer relative to confirmation time, not original
+ /* Timeout is relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
- /* set conntrack timestamp, if enabled. */
- tstamp = nf_conn_tstamp_find(ct);
- if (tstamp) {
- if (skb->tstamp == 0)
- __net_timestamp(skb);
+ __nf_conntrack_insert_prepare(ct);
- tstamp->start = ktime_to_ns(skb->tstamp);
- }
/* Since the lookup is lockless, hash insertion must be done after
- * starting the timer and setting the CONFIRMED bit. The RCU barriers
- * guarantee that no other CPU can find the conntrack before the above
- * stores are visible.
+ * setting ct->timeout. The RCU barriers guarantee that no other CPU
+ * can find the conntrack before the above stores are visible.
*/
__nf_conntrack_hash_insert(ct, hash, reply_hash);
+
+ /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups
+ * skip entries that lack this bit. This happens when a CPU is looking
+ * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU
+ * or when another CPU encounters this entry right after the insertion
+ * but before the set-confirm-bit below. This bit must not be set until
+ * after __nf_conntrack_hash_insert().
+ */
+ smp_mb__before_atomic();
+ set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
+ /* ext area is still valid (rcu read lock is held, but will
+ * go out of scope soon); we need to remove this conntrack
+ * again.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ return NF_DROP;
+ }
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -964,17 +1339,15 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
out:
- nf_ct_add_to_dying_list(ct);
- ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+ ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
-/* Returns true if a connection correspondings to the tuple (required
+/* Returns true if a connection corresponds to the tuple (required
for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
@@ -993,7 +1366,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
rcu_read_lock();
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
- hash = __hash_conntrack(net, tuple, hsize);
+ hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -1007,6 +1380,23 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
}
if (nf_ct_key_equal(h, tuple, zone, net)) {
+ /* Tuple is taken already, so caller will need to find
+ * a new source port to use.
+ *
+ * Only exception:
+ * If the *original tuples* are identical, then both
+ * conntracks refer to the same flow.
+ * This is a rare situation; it can occur e.g. when
+ * more than one UDP packet is sent from the same socket
+ * in different threads.
+ *
+ * Let nf_ct_resolve_clash() deal with this later.
+ */
+ if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
+ continue;
+
NF_CT_STAT_INC_ATOMIC(net, found);
rcu_read_unlock();
return 1;
@@ -1039,9 +1429,6 @@ static unsigned int early_drop_list(struct net *net,
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
- continue;
-
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
@@ -1052,9 +1439,12 @@ static unsigned int early_drop_list(struct net *net,
nf_ct_is_dying(tmp))
continue;
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->ct_net and ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
/* kill only if still in same netns -- might have moved due to
* SLAB_TYPESAFE_BY_RCU rules.
*
@@ -1108,68 +1498,78 @@ static bool gc_worker_skip_ct(const struct nf_conn *ct)
static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
+ u8 protonum = nf_ct_protonum(ct);
if (!test_bit(IPS_ASSURED_BIT, &ct->status))
return true;
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(protonum);
if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
return true;
return false;
}
-#define DAY (86400 * HZ)
-
-/* Set an arbitrary timeout large enough not to ever expire, this save
- * us a check for the IPS_OFFLOAD_BIT from the packet path via
- * nf_ct_is_expired().
- */
-static void nf_ct_offload_timeout(struct nf_conn *ct)
-{
- if (nf_ct_expires(ct) < DAY / 2)
- ct->timeout = nfct_time_stamp + DAY;
-}
-
static void gc_worker(struct work_struct *work)
{
- unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
- unsigned int i, goal, buckets = 0, expired_count = 0;
- unsigned int nf_conntrack_max95 = 0;
+ unsigned int i, hashsz, nf_conntrack_max95 = 0;
+ u32 end_time, start_time = nfct_time_stamp;
struct conntrack_gc_work *gc_work;
- unsigned int ratio, scanned = 0;
+ unsigned int expired_count = 0;
unsigned long next_run;
+ s32 delta_time;
+ long count;
gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
- goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
- i = gc_work->last_bucket;
+ i = gc_work->next_bucket;
if (gc_work->early_drop)
nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+ if (i == 0) {
+ gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+ gc_work->count = GC_SCAN_INITIAL_COUNT;
+ gc_work->start_time = start_time;
+ }
+
+ next_run = gc_work->avg_timeout;
+ count = gc_work->count;
+
+ end_time = start_time + GC_SCAN_MAX_DURATION;
+
do {
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
- unsigned int hashsz;
struct nf_conn *tmp;
- i++;
rcu_read_lock();
nf_conntrack_get_ht(&ct_hash, &hashsz);
- if (i >= hashsz)
- i = 0;
+ if (i >= hashsz) {
+ rcu_read_unlock();
+ break;
+ }
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+ struct nf_conntrack_net *cnet;
struct net *net;
+ long expires;
tmp = nf_ct_tuplehash_to_ctrack(h);
- scanned++;
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- continue;
+ if (expired_count > GC_SCAN_EXPIRED_MAX) {
+ rcu_read_unlock();
+
+ gc_work->next_bucket = i;
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+
+ delta_time = nfct_time_stamp - gc_work->start_time;
+
+ /* re-sched immediately if total cycle time is exceeded */
+ next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+ goto early_exit;
}
if (nf_ct_is_expired(tmp)) {
@@ -1178,24 +1578,34 @@ static void gc_worker(struct work_struct *work)
continue;
}
+ expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+ expires = (expires - (long)next_run) / ++count;
+ next_run += expires;
+
if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
continue;
net = nf_ct_net(tmp);
- if (atomic_read(&net->ct.count) < nf_conntrack_max95)
+ cnet = nf_ct_pernet(net);
+ if (atomic_read(&cnet->count) < nf_conntrack_max95)
continue;
/* need to take reference to avoid possible races */
- if (!atomic_inc_not_zero(&tmp->ct_general.use))
+ if (!refcount_inc_not_zero(&tmp->ct_general.use))
continue;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (gc_worker_skip_ct(tmp)) {
nf_ct_put(tmp);
continue;
}
- if (gc_worker_can_early_drop(tmp))
+ if (gc_worker_can_early_drop(tmp)) {
nf_ct_kill(tmp);
+ expired_count++;
+ }
nf_ct_put(tmp);
}
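The 'expires = clamp(...); expires = (expires - next_run) / ++count; next_run += expires;' lines above maintain an incremental running mean of the remaining timeouts, seeded with the GC_SCAN_INITIAL_COUNT bias. A standalone sketch of that arithmetic (sample values are hypothetical):

#include <stdio.h>

int main(void)
{
	/* hypothetical clamped remaining-timeout samples, in seconds */
	long samples[] = { 5, 300, 60, 1 };
	long next_run = 60;	/* stands in for GC_SCAN_INTERVAL_INIT */
	long count = 100;	/* stands in for GC_SCAN_INITIAL_COUNT */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		/* avg' = avg + (sample - avg) / ++count */
		next_run += (samples[i] - next_run) / ++count;
		printf("sample=%3ld -> next_run=%ld (count=%ld)\n",
		       samples[i], next_run, count);
	}
	return 0;
}

With the initial count bias in place, a handful of short-lived entries barely moves the interval, which is exactly the "don't wake up often for three 1s entries" behaviour described at the top of the file.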
@@ -1206,51 +1616,41 @@ static void gc_worker(struct work_struct *work)
*/
rcu_read_unlock();
cond_resched();
- } while (++buckets < goal);
+ i++;
- if (gc_work->exiting)
- return;
+ delta_time = nfct_time_stamp - end_time;
+ if (delta_time > 0 && i < hashsz) {
+ gc_work->avg_timeout = next_run;
+ gc_work->count = count;
+ gc_work->next_bucket = i;
+ next_run = 0;
+ goto early_exit;
+ }
+ } while (i < hashsz);
- /*
- * Eviction will normally happen from the packet path, and not
- * from this gc worker.
- *
- * This worker is only here to reap expired entries when system went
- * idle after a busy period.
- *
- * The heuristics below are supposed to balance conflicting goals:
- *
- * 1. Minimize time until we notice a stale entry
- * 2. Maximize scan intervals to not waste cycles
- *
- * Normally, expire ratio will be close to 0.
- *
- * As soon as a sizeable fraction of the entries have expired
- * increase scan frequency.
- */
- ratio = scanned ? expired_count * 100 / scanned : 0;
- if (ratio > GC_EVICT_RATIO) {
- gc_work->next_gc_run = min_interval;
- } else {
- unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
+ gc_work->next_bucket = 0;
- BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);
+ next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
- gc_work->next_gc_run += min_interval;
- if (gc_work->next_gc_run > max)
- gc_work->next_gc_run = max;
- }
+ delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+ if (next_run > (unsigned long)delta_time)
+ next_run -= delta_time;
+ else
+ next_run = 1;
+
+early_exit:
+ if (gc_work->exiting)
+ return;
+
+ if (next_run)
+ gc_work->early_drop = false;
- next_run = gc_work->next_gc_run;
- gc_work->last_bucket = i;
- gc_work->early_drop = false;
queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}
static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
- INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
- gc_work->next_gc_run = HZ;
+ INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
gc_work->exiting = false;
}
@@ -1261,18 +1661,23 @@ __nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *repl,
gfp_t gfp, u32 hash)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ unsigned int ct_count;
struct nf_conn *ct;
/* We don't want any race condition at early drop stage */
- atomic_inc(&net->ct.count);
+ ct_count = atomic_inc_return(&cnet->count);
- if (nf_conntrack_max &&
- unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+ if (unlikely(ct_count > nf_conntrack_max)) {
if (!early_drop(net, hash)) {
if (!conntrack_gc_work.early_drop)
conntrack_gc_work.early_drop = true;
- atomic_dec(&net->ct.count);
- net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ atomic_dec(&cnet->count);
+ if (net == &init_net)
+ net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+ else
+ net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n",
+ net->ns.inum);
return ERR_PTR(-ENOMEM);
}
}
@@ -1292,20 +1697,19 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0;
+ WRITE_ONCE(ct->timeout, 0);
write_pnet(&ct->ct_net, net);
- memset(&ct->__nfct_init_offset[0], 0,
- offsetof(struct nf_conn, proto) -
- offsetof(struct nf_conn, __nfct_init_offset[0]));
+ memset_after(ct, 0, __nfct_init_offset);
nf_ct_zone_add(ct, zone);
/* Because we use RCU lookups, we set ct_general.use to zero before
* this is inserted in any list.
*/
- atomic_set(&ct->ct_general.use, 0);
+ refcount_set(&ct->ct_general.use, 0);
return ct;
out:
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
return ERR_PTR(-ENOMEM);
}
@@ -1322,17 +1726,29 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
void nf_conntrack_free(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_net *cnet;
/* A freed object has refcnt == 0, that's
* the golden rule for SLAB_TYPESAFE_BY_RCU
*/
- WARN_ON(atomic_read(&ct->ct_general.use) != 0);
+ WARN_ON(refcount_read(&ct->ct_general.use) != 0);
+
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ const struct nf_nat_hook *nat_hook;
- nf_ct_ext_destroy(ct);
- nf_ct_ext_free(ct);
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+ if (nat_hook)
+ nat_hook->remove_nat_bysrc(ct);
+ rcu_read_unlock();
+ }
+
+ kfree(ct->ext);
kmem_cache_free(nf_conntrack_cachep, ct);
+ cnet = nf_ct_pernet(net);
+
smp_mb__before_atomic();
- atomic_dec(&net->ct.count);
+ atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
@@ -1342,29 +1758,29 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_ecache *ecache;
+#endif
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
+ struct nf_conntrack_net *cnet;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
- pr_debug("Can't invert tuple.\n");
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple))
return NULL;
- }
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
- return (struct nf_conntrack_tuple_hash *)ct;
+ return ERR_CAST(ct);
if (!nf_ct_add_synproxy(ct, tmpl)) {
nf_conntrack_free(ct);
@@ -1381,18 +1797,23 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
- nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
- ecache ? ecache->expmask : 0,
- GFP_ATOMIC);
- local_bh_disable();
- if (net->ct.expect_count) {
- spin_lock(&nf_conntrack_expect_lock);
- exp = nf_ct_find_expectation(net, zone, tuple);
+ if ((ecache || net->ct.sysctl_events) &&
+ !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
+ }
+#endif
+
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count) {
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
if (exp) {
- pr_debug("expectation arrives ct=%p exp=%p\n",
- ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
@@ -1404,23 +1825,30 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- ct->mark = exp->master->mark;
+ ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
ct->secmark = exp->master->secmark;
#endif
NF_CT_STAT_INC(net, expect_new);
}
- spin_unlock(&nf_conntrack_expect_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
- if (!exp)
+ if (!exp && tmpl)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- /* Now it is inserted into the unconfirmed list, bump refcount */
- nf_conntrack_get(&ct->ct_general);
- nf_ct_add_to_unconfirmed_list(ct);
+ /* Other CPU might have obtained a pointer to this object before it was
+ * released. Because refcount is 0, refcount_inc_not_zero() will fail.
+ *
+ * After refcount_set(1) it will succeed; ensure that zeroing of
+ * ct->status and the correct ct->net pointer are visible; else other
+ * core might observe the CONFIRMED bit, which means the entry is valid
+ * and in the hash table, but it's not (anymore).
+ */
+ smp_wmb();
- local_bh_enable();
+ /* Now it is going to be associated with an sk_buff, set refcount to 1. */
+ refcount_set(&ct->ct_general.use, 1);
if (exp) {
if (exp->expectfn)
@@ -1437,7 +1865,6 @@ resolve_normal_ct(struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
u_int8_t protonum,
- const struct nf_conntrack_l4proto *l4proto,
const struct nf_hook_state *state)
{
const struct nf_conntrack_zone *zone;
@@ -1445,22 +1872,32 @@ resolve_normal_ct(struct nf_conn *tmpl,
struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+ u32 hash, zone_id, rid;
struct nf_conn *ct;
- u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, state->pf, protonum, state->net,
- &tuple, l4proto)) {
- pr_debug("Can't get tuple\n");
+ &tuple))
return 0;
- }
/* look for tuple match */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
- hash = hash_conntrack_raw(&tuple, state->net);
+
+ zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
+ hash = hash_conntrack_raw(&tuple, zone_id, state->net);
h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);
+
if (!h) {
- h = init_conntrack(state->net, tmpl, &tuple, l4proto,
+ rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
+ if (zone_id != rid) {
+ u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);
+
+ h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
+ }
+ }
+
+ if (!h) {
+ h = init_conntrack(state->net, tmpl, &tuple,
skb, dataoff, hash);
if (!h)
return 0;
@@ -1473,17 +1910,15 @@ resolve_normal_ct(struct nf_conn *tmpl,
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
ctinfo = IP_CT_ESTABLISHED_REPLY;
} else {
+ unsigned long status = READ_ONCE(ct->status);
+
/* Once we've had two way comms, always ESTABLISHED. */
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("normal packet for %p\n", ct);
+ if (likely(status & IPS_SEEN_REPLY))
ctinfo = IP_CT_ESTABLISHED;
- } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("related packet for %p\n", ct);
+ else if (status & IPS_EXPECTED)
ctinfo = IP_CT_RELATED;
- } else {
- pr_debug("new packet for %p\n", ct);
+ else
ctinfo = IP_CT_NEW;
- }
}
nf_ct_set(skb, ct, ctinfo);
return 0;
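The second lookup above exists because a conntrack zone may be scoped to a single direction; in that case the original and reply tuples hash under different zone ids, so a miss under one id must be retried under the other. The same logic in isolation, as a sketch built from the helpers used above:

/* Sketch of the two-pass, direction-aware lookup used above. */
static struct nf_conntrack_tuple_hash *
find_both_zone_ids_sketch(struct net *net,
			  const struct nf_conntrack_zone *zone,
			  const struct nf_conntrack_tuple *tuple)
{
	u32 orig_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	u32 repl_id = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	struct nf_conntrack_tuple_hash *h;

	h = __nf_conntrack_find_get(net, zone, tuple,
				    hash_conntrack_raw(tuple, orig_id, net));

	/* Zones not restricted to one direction yield the same id for
	 * both directions, so the retry is skipped in the common case.
	 */
	if (!h && orig_id != repl_id)
		h = __nf_conntrack_find_get(net, zone, tuple,
					    hash_conntrack_raw(tuple, repl_id, net));

	return h;
}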
@@ -1514,18 +1949,67 @@ nf_conntrack_handle_icmp(struct nf_conn *tmpl,
else
return NF_ACCEPT;
- if (ret <= 0) {
+ if (ret <= 0)
NF_CT_STAT_INC_ATOMIC(state->net, error);
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- }
return ret;
}
+static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+{
+ const unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+ if (!timeout)
+ timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
+
+ nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
+ return NF_ACCEPT;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int nf_conntrack_handle_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
+{
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ return nf_conntrack_tcp_packet(ct, skb, dataoff,
+ ctinfo, state);
+ case IPPROTO_UDP:
+ return nf_conntrack_udp_packet(ct, skb, dataoff,
+ ctinfo, state);
+ case IPPROTO_ICMP:
+ return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ case IPPROTO_UDPLITE:
+ return nf_conntrack_udplite_packet(ct, skb, dataoff,
+ ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ case IPPROTO_SCTP:
+ return nf_conntrack_sctp_packet(ct, skb, dataoff,
+ ctinfo, state);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ return nf_conntrack_gre_packet(ct, skb, dataoff,
+ ctinfo, state);
+#endif
+ }
+
+ return generic_packet(ct, skb, ctinfo);
+}
+
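For context, the switch above replaces the indirect call removed elsewhere in this patch; direct calls are cheaper than function-pointer dispatch when indirect branches are penalized by retpolines:

/* Before this patch, the same step went through a function pointer:
 *
 *	l4proto = __nf_ct_l4proto_find(protonum);
 *	...
 *	ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
 *
 * Unhandled protocol numbers now fall back to generic_packet(), which
 * only refreshes the generic timeout.
 */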
unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
- const struct nf_conntrack_l4proto *l4proto;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct, *tmpl;
u_int8_t protonum;
@@ -1535,25 +2019,19 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
- ctinfo == IP_CT_UNTRACKED) {
- NF_CT_STAT_INC_ATOMIC(state->net, ignore);
+ ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
- }
skb->_nfct = 0;
}
/* rcu_read_lock()ed by nf_hook_thresh */
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {
- pr_debug("not prepared to track yet or error occurred\n");
- NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
}
- l4proto = __nf_ct_l4proto_find(protonum);
-
if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
protonum, state);
@@ -1567,7 +2045,7 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
}
repeat:
ret = resolve_normal_ct(tmpl, skb, dataoff,
- protonum, l4proto, state);
+ protonum, state);
if (ret < 0) {
/* Too stressed to deal. */
NF_CT_STAT_INC_ATOMIC(state->net, drop);
@@ -1583,22 +2061,23 @@ repeat:
goto out;
}
- ret = l4proto->packet(ct, skb, dataoff, ctinfo, state);
+ ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
- pr_debug("nf_conntrack_in: Can't track with proto module\n");
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
skb->_nfct = 0;
- NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
- NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
* closed/aborted connection. We have to go back and create a
* fresh conntrack.
*/
if (ret == -NF_REPEAT)
goto repeat;
+
+ NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+ if (ret == NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(state->net, drop);
+
ret = -ret;
goto out;
}
@@ -1614,51 +2093,12 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);
-bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
- const struct nf_conntrack_tuple *orig)
-{
- bool ret;
-
- rcu_read_lock();
- ret = nf_ct_invert_tuple(inverse, orig,
- __nf_ct_l4proto_find(orig->dst.protonum));
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
-
-/* Alter reply tuple (maybe alter helper). This is for NAT, and is
- implicitly racy: see __nf_conntrack_confirm */
-void nf_conntrack_alter_reply(struct nf_conn *ct,
- const struct nf_conntrack_tuple *newreply)
-{
- struct nf_conn_help *help = nfct_help(ct);
-
- /* Should be unconfirmed, so not in hash table yet */
- WARN_ON(nf_ct_is_confirmed(ct));
-
- pr_debug("Altering reply tuple of %p to ", ct);
- nf_ct_dump_tuple(newreply);
-
- ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
- if (ct->master || (help && !hlist_empty(&help->expectations)))
- return;
-
- rcu_read_lock();
- __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
-
/* Refresh conntrack for this many jiffies and do accounting if bytes is non-zero */
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
- unsigned long extra_jiffies,
- int do_acct)
+ u32 extra_jiffies,
+ unsigned int bytes)
{
- WARN_ON(!skb);
-
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;
@@ -1667,10 +2107,11 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (nf_ct_is_confirmed(ct))
extra_jiffies += nfct_time_stamp;
- ct->timeout = extra_jiffies;
+ if (READ_ONCE(ct->timeout) != extra_jiffies)
+ WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
- if (do_acct)
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ if (bytes)
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
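The conditional store above is a cache-dirtying optimization: most packets refresh a timeout that is already up to date at jiffies granularity, so checking first avoids repeatedly writing a hot, shared cacheline. The same pattern in isolation, as a minimal sketch:

/* Sketch: skip the store when the deadline is unchanged, so packet
 * bursts do not keep dirtying the cacheline holding ct->timeout.
 */
static inline void refresh_timeout_sketch(struct nf_conn *ct, u32 timeout)
{
	if (nf_ct_is_confirmed(ct))
		timeout += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != timeout)
		WRITE_ONCE(ct->timeout, timeout);
}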
@@ -1678,7 +2119,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb)
{
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
return nf_ct_delete(ct, 0, 0);
}
@@ -1690,9 +2131,7 @@ EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>
-/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
- * in ip_conntrack_core, since we don't want the protocols to autoload
- * or depend on ctnetlink */
+/* Generic function for tcp/udp/sctp/dccp and the like. */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
@@ -1713,13 +2152,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
+ if (!tb[CTA_PROTO_SRC_PORT])
+ return -EINVAL;
+
+ t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
+ if (!tb[CTA_PROTO_DST_PORT])
+ return -EINVAL;
- t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
- t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+ t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+ }
return 0;
}
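With the new flags argument, each port attribute is mandatory only when the corresponding filter flag is set, which lets ctnetlink parse partial tuples for filtering. A hypothetical caller (the wrapper name is illustrative only):

/* Hypothetical caller: build a filter tuple that matches on the source
 * port only; CTA_PROTO_DST_PORT may then be absent from the message.
 */
static int parse_sport_filter_sketch(struct nlattr *tb[],
				     struct nf_conntrack_tuple *t)
{
	return nf_ct_port_nlattr_to_tuple(tb, t,
					  CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT));
}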
@@ -1755,79 +2203,70 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}
-static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+/* This packet is coming from userspace via nf_queue; complete the packet
+ * processing after the helper invocation in nf_confirm().
+ */
+static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
{
- const struct nf_conntrack_l4proto *l4proto;
- struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
- enum ip_conntrack_info ctinfo;
- struct nf_nat_hook *nat_hook;
- unsigned int status;
- struct nf_conn *ct;
- int dataoff;
- u16 l3num;
- u8 l4num;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || nf_ct_is_confirmed(ct))
- return 0;
-
- l3num = nf_ct_l3num(ct);
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ int protoff;
- dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
- if (dataoff <= 0)
- return -1;
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
- l4proto = nf_ct_l4proto_find_get(l4num);
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple, l4proto))
- return -1;
+ if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
+ return NF_ACCEPT;
- if (ct->status & IPS_SRC_NAT) {
- memcpy(tuple.src.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
- sizeof(tuple.src.u3.all));
- tuple.src.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6: {
+ __be16 frag_off;
+ u8 pnum;
+
+ pnum = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
+ return NF_ACCEPT;
+ break;
}
-
- if (ct->status & IPS_DST_NAT) {
- memcpy(tuple.dst.u3.all,
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
- sizeof(tuple.dst.u3.all));
- tuple.dst.u.all =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
+#endif
+ default:
+ return NF_ACCEPT;
}
- h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
- if (!h)
- return 0;
-
- /* Store status bits of the conntrack that is clashing to re-do NAT
- * mangling according to what it has been done already to this packet.
- */
- status = ct->status;
-
- nf_ct_put(ct);
- ct = nf_ct_tuplehash_to_ctrack(h);
- nf_ct_set(skb, ct, ctinfo);
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
- nat_hook = rcu_dereference(nf_nat_hook);
- if (!nat_hook)
- return 0;
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
- if (status & IPS_SRC_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
+static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
- if (status & IPS_DST_NAT &&
- nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
- IP_CT_DIR_ORIGINAL) == NF_DROP)
- return -1;
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
- return 0;
+ return nf_confirm_cthelper(skb, ct, ctinfo);
}
static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
@@ -1868,7 +2307,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
- void *data, unsigned int *bucket)
+ const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -1876,17 +2315,36 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
spinlock_t *lockp;
for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
+ struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
+
+ if (hlist_nulls_empty(hslot))
+ continue;
+
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
nf_conntrack_lock(lockp);
- if (*bucket < nf_conntrack_htable_size) {
- hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
- goto found;
- }
+ hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
+ continue;
+ /* All nf_conn objects are added to the hash table twice, once
+ * for the original direction tuple, once for the reply tuple.
+ *
+ * Exception: In the IPS_NAT_CLASH case, only the reply
+ * tuple is added (the original tuple already existed for
+ * a different object).
+ *
+ * We only need to call the iterator once for each
+ * conntrack, so we just use the 'reply' direction
+ * tuple while iterating.
+ */
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (iter_data->net &&
+ !net_eq(iter_data->net, nf_ct_net(ct)))
+ continue;
+
+ if (iter(ct, iter_data->data))
+ goto found;
}
spin_unlock(lockp);
local_bh_enable();
@@ -1895,109 +2353,43 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
return NULL;
found:
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
spin_unlock(lockp);
local_bh_enable();
return ct;
}
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
+ const struct nf_ct_iter_data *iter_data)
{
- unsigned int bucket = 0, sequence;
+ unsigned int bucket = 0;
struct nf_conn *ct;
might_sleep();
- for (;;) {
- sequence = read_seqcount_begin(&nf_conntrack_generation);
-
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
- /* Time to push up daises... */
-
- nf_ct_delete(ct, portid, report);
- nf_ct_put(ct);
- cond_resched();
- }
-
- if (!read_seqcount_retry(&nf_conntrack_generation, sequence))
- break;
- bucket = 0;
- }
-}
-
-struct iter_data {
- int (*iter)(struct nf_conn *i, void *data);
- void *data;
- struct net *net;
-};
+ mutex_lock(&nf_conntrack_mutex);
+ while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
+ /* Time to push up daisies... */
-static int iter_net_only(struct nf_conn *i, void *data)
-{
- struct iter_data *d = data;
-
- if (!net_eq(d->net, nf_ct_net(i)))
- return 0;
-
- return d->iter(i, d->data);
-}
-
-static void
-__nf_ct_unconfirmed_destroy(struct net *net)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
- struct ct_pcpu *pcpu;
-
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
- struct nf_conn *ct;
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- /* we cannot call iter() on unconfirmed list, the
- * owning cpu can reallocate ct->ext at any time.
- */
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&pcpu->lock);
+ nf_ct_delete(ct, iter_data->portid, iter_data->report);
+ nf_ct_put(ct);
cond_resched();
}
+ mutex_unlock(&nf_conntrack_mutex);
}
-void nf_ct_unconfirmed_destroy(struct net *net)
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data)
{
- might_sleep();
-
- if (atomic_read(&net->ct.count) > 0) {
- __nf_ct_unconfirmed_destroy(net);
- nf_queue_nf_hook_drop(net);
- synchronize_net();
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
-
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
-{
- struct iter_data d;
+ struct net *net = iter_data->net;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
might_sleep();
- if (atomic_read(&net->ct.count) == 0)
+ if (atomic_read(&cnet->count) == 0)
return;
- d.iter = iter;
- d.data = data;
- d.net = net;
-
- nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
+ nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
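After this API change, callers pass the namespace, portid and report arguments bundled in struct nf_ct_iter_data instead of as separate parameters. A sketch of a converted caller (kill_all_sketch() is a stand-in callback, mirroring the kill_all() helper below):

static int kill_all_sketch(struct nf_conn *i, void *data)
{
	return 1;	/* delete every entry the iterator visits */
}

static void flush_net_sketch(struct net *net)
{
	struct nf_ct_iter_data iter_data = {
		.net	= net,
		.portid	= 0,
		.report	= 0,
	};

	nf_ct_iterate_cleanup_net(kill_all_sketch, &iter_data);
}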
@@ -2015,43 +2407,56 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
down_read(&net_rwsem);
for_each_net(net) {
- if (atomic_read(&net->ct.count) == 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (atomic_read(&cnet->count) == 0)
continue;
- __nf_ct_unconfirmed_destroy(net);
nf_queue_nf_hook_drop(net);
}
up_read(&net_rwsem);
/* Need to wait for netns cleanup worker to finish, if it's
* running -- it might have deleted a net namespace from
- * the global list, so our __nf_ct_unconfirmed_destroy() might
- * not have affected all namespaces.
+ * the global list, so hook drop above might not have
+ * affected all namespaces.
*/
net_ns_barrier();
- /* a conntrack could have been unlinked from unconfirmed list
- * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
+ /* an skb with an unconfirmed conntrack could have been reinjected just
+ * before we called nf_queue_nf_hook_drop().
+ *
* This makes sure it's inserted into the conntrack table.
*/
synchronize_net();
- nf_ct_iterate_cleanup(iter, data, 0, 0);
+ nf_ct_ext_bump_genid();
+ iter_data.data = data;
+ nf_ct_iterate_cleanup(iter, &iter_data);
+
+ /* Another cpu might be in a rcu read section with
+ * rcu protected pointer cleared in iter callback
+ * or hidden via nf_ct_ext_bump_genid() above.
+ *
+ * Wait until those are done.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
static int kill_all(struct nf_conn *i, void *data)
{
- return net_eq(nf_ct_net(i), data);
+ return 1;
}
void nf_conntrack_cleanup_start(void)
{
+ cleanup_nf_conntrack_bpf();
conntrack_gc_work.exiting = true;
- RCU_INIT_POINTER(ip_ct_attach, NULL);
}
void nf_conntrack_cleanup_end(void)
@@ -2061,13 +2466,7 @@ void nf_conntrack_cleanup_end(void)
kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
- nf_conntrack_seqadj_fini();
- nf_conntrack_labels_fini();
nf_conntrack_helper_fini();
- nf_conntrack_timeout_fini();
- nf_conntrack_ecache_fini();
- nf_conntrack_tstamp_fini();
- nf_conntrack_acct_fini();
nf_conntrack_expect_fini();
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2087,20 +2486,24 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- int busy;
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
+ int busy;
/*
* This makes sure all current packets have passed through
* netfilter framework. Roll on, two-stage module
* delete...
*/
- synchronize_net();
+ synchronize_rcu_expedited();
i_see_dead_people:
busy = 0;
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_ct_iterate_cleanup(kill_all, net, 0, 0);
- if (atomic_read(&net->ct.count) != 0)
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ iter_data.net = net;
+ nf_ct_iterate_cleanup_net(kill_all, &iter_data);
+ if (atomic_read(&cnet->count) != 0)
busy = 1;
}
if (busy) {
@@ -2109,11 +2512,9 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_proto_pernet_fini(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
- free_percpu(net->ct.pcpu_lists);
}
}
@@ -2122,14 +2523,16 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
- if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
+ if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
- hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
- GFP_KERNEL | __GFP_ZERO);
+ if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head)))
+ return NULL;
+
+ hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -2154,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
if (!hash)
return -ENOMEM;
+ mutex_lock(&nf_conntrack_mutex);
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
+ mutex_unlock(&nf_conntrack_mutex);
kvfree(hash);
return 0;
}
@@ -2172,16 +2577,19 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
for (i = 0; i < nf_conntrack_htable_size; i++) {
while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
+ unsigned int zone_id;
+
h = hlist_nulls_entry(nf_conntrack_hash[i].first,
struct nf_conntrack_tuple_hash, hnnode);
ct = nf_ct_tuplehash_to_ctrack(h);
hlist_nulls_del_rcu(&h->hnnode);
+
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
bucket = __hash_conntrack(nf_ct_net(ct),
- &h->tuple, hashsize);
+ &h->tuple, zone_id, hashsize);
hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
}
}
- old_size = nf_conntrack_htable_size;
old_hash = nf_conntrack_hash;
nf_conntrack_hash = hash;
@@ -2191,6 +2599,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
nf_conntrack_all_unlock();
local_bh_enable();
+ mutex_unlock(&nf_conntrack_mutex);
+
synchronize_net();
kvfree(old_hash);
return 0;
@@ -2214,37 +2624,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return nf_conntrack_hash_resize(hashsize);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
-
-static __always_inline unsigned int total_extension_size(void)
-{
- /* remember to add new extensions below */
- BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
-
- return sizeof(struct nf_ct_ext) +
- sizeof(struct nf_conn_help)
-#if IS_ENABLED(CONFIG_NF_NAT)
- + sizeof(struct nf_conn_nat)
-#endif
- + sizeof(struct nf_conn_seqadj)
- + sizeof(struct nf_conn_acct)
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- + sizeof(struct nf_conntrack_ecache)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- + sizeof(struct nf_conn_tstamp)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- + sizeof(struct nf_conn_timeout)
-#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
- + sizeof(struct nf_conn_labels)
-#endif
-#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
- + sizeof(struct nf_conn_synproxy)
-#endif
- ;
-};
int nf_conntrack_init_start(void)
{
@@ -2253,35 +2632,31 @@ int nf_conntrack_init_start(void)
int ret = -ENOMEM;
int i;
- /* struct nf_ct_ext uses u8 to store offsets/size */
- BUILD_BUG_ON(total_extension_size() > 255u);
-
- seqcount_init(&nf_conntrack_generation);
+ seqcount_spinlock_init(&nf_conntrack_generation,
+ &nf_conntrack_locks_all_lock);
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_conntrack_locks[i]);
if (!nf_conntrack_htable_size) {
- /* Idea from tcp.c: use 1/16384 of memory.
- * On i386: 32MB machine has 512 buckets.
- * >= 1GB machines have 16384 buckets.
- * >= 4GB machines have 65536 buckets.
- */
nf_conntrack_htable_size
= (((nr_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
- if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
- nf_conntrack_htable_size = 65536;
+ if (BITS_PER_LONG >= 64 &&
+ nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
+ nf_conntrack_htable_size = 262144;
else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
- nf_conntrack_htable_size = 16384;
- if (nf_conntrack_htable_size < 32)
- nf_conntrack_htable_size = 32;
+ nf_conntrack_htable_size = 65536;
- /* Use a max. factor of four by default to get the same max as
- * with the old struct list_heads. When a table size is given
- * we use the old value of 8 to avoid reducing the max.
- * entries. */
- max_factor = 4;
+ if (nf_conntrack_htable_size < 1024)
+ nf_conntrack_htable_size = 1024;
+ /* Use a max. factor of one by default to keep the average
+ * hash chain length at 2 entries. Each entry has to be added
+ * twice (once for original direction, once for reply).
+ * When a table size is given we use the old value of 8 to
+ * avoid implicit reduction of the max entries setting.
+ */
+ max_factor = 1;
}
nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
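A worked example of the sizing above, assuming a 64-bit machine with 4 KiB pages (and that, as done later in this function, nf_conntrack_max is derived as max_factor * nf_conntrack_htable_size):

/*
 * 16 GiB of RAM:
 *	(16 GiB / 16384) / sizeof(struct hlist_head)
 *	  = 1 MiB / 8 = 131072 buckets,
 *	then raised to 262144 because nr_pages exceeds 4 GiB worth
 *	of memory on a 64-bit machine.
 *
 * With max_factor == 1 that permits 262144 entries; every entry is
 * hashed twice (original + reply), so 262144 buckets keep the average
 * chain length near 2 even at the limit.
 */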
@@ -2301,34 +2676,10 @@ int nf_conntrack_init_start(void)
if (ret < 0)
goto err_expect;
- ret = nf_conntrack_acct_init();
- if (ret < 0)
- goto err_acct;
-
- ret = nf_conntrack_tstamp_init();
- if (ret < 0)
- goto err_tstamp;
-
- ret = nf_conntrack_ecache_init();
- if (ret < 0)
- goto err_ecache;
-
- ret = nf_conntrack_timeout_init();
- if (ret < 0)
- goto err_timeout;
-
ret = nf_conntrack_helper_init();
if (ret < 0)
goto err_helper;
- ret = nf_conntrack_labels_init();
- if (ret < 0)
- goto err_labels;
-
- ret = nf_conntrack_seqadj_init();
- if (ret < 0)
- goto err_seqadj;
-
ret = nf_conntrack_proto_init();
if (ret < 0)
goto err_proto;
@@ -2336,23 +2687,18 @@ int nf_conntrack_init_start(void)
conntrack_gc_work_init(&conntrack_gc_work);
queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
+ ret = register_nf_conntrack_bpf();
+ if (ret < 0)
+ goto err_kfunc;
+
return 0;
+err_kfunc:
+ cancel_delayed_work_sync(&conntrack_gc_work.dwork);
+ nf_conntrack_proto_fini();
err_proto:
- nf_conntrack_seqadj_fini();
-err_seqadj:
- nf_conntrack_labels_fini();
-err_labels:
nf_conntrack_helper_fini();
err_helper:
- nf_conntrack_timeout_fini();
-err_timeout:
- nf_conntrack_ecache_fini();
-err_ecache:
- nf_conntrack_tstamp_fini();
-err_tstamp:
- nf_conntrack_acct_fini();
-err_acct:
nf_conntrack_expect_fini();
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
@@ -2361,16 +2707,29 @@ err_cachep:
return ret;
}
-static struct nf_ct_hook nf_conntrack_hook = {
+static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
+{
+ struct nf_conn *ct = nf_ct_to_nf_conn(nfct);
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ nf_conntrack_tcp_set_closing(ct);
+ break;
+ }
+}
+
+static const struct nf_ct_hook nf_conntrack_hook = {
.update = nf_conntrack_update,
- .destroy = destroy_conntrack,
+ .destroy = nf_ct_destroy,
.get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
+ .set_closing = nf_conntrack_set_closing,
+ .confirm = __nf_conntrack_confirm,
+ .get_id = nf_conntrack_get_id,
};
void nf_conntrack_init_end(void)
{
- /* For use by REJECT target */
- RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}
@@ -2378,32 +2737,19 @@ void nf_conntrack_init_end(void)
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
-#define DYING_NULLS_VAL ((1<<30)+1)
-#define TEMPLATE_NULLS_VAL ((1<<30)+2)
int nf_conntrack_init_net(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
int ret = -ENOMEM;
- int cpu;
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
- atomic_set(&net->ct.count, 0);
-
- net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
- if (!net->ct.pcpu_lists)
- goto err_stat;
-
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_init(&pcpu->lock);
- INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- }
+ BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
+ atomic_set(&cnet->count, 0);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat)
- goto err_pcpu_lists;
+ return ret;
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
@@ -2412,20 +2758,67 @@ int nf_conntrack_init_net(struct net *net)
nf_conntrack_acct_pernet_init(net);
nf_conntrack_tstamp_pernet_init(net);
nf_conntrack_ecache_pernet_init(net);
- nf_conntrack_helper_pernet_init(net);
+ nf_conntrack_proto_pernet_init(net);
- ret = nf_conntrack_proto_pernet_init(net);
- if (ret < 0)
- goto err_proto;
return 0;
-err_proto:
- nf_conntrack_ecache_pernet_fini(net);
- nf_conntrack_expect_pernet_fini(net);
err_expect:
free_percpu(net->ct.stat);
-err_pcpu_lists:
- free_percpu(net->ct.pcpu_lists);
-err_stat:
return ret;
}
+
+/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
+
+int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
+{
+ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return -EPERM;
+
+ __nf_ct_set_timeout(ct, timeout);
+
+ if (test_bit(IPS_DYING_BIT, &ct->status))
+ return -ETIME;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
+
+void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
+{
+ unsigned int bit;
+
+ /* Ignore these unchangeable bits */
+ on &= ~IPS_UNCHANGEABLE_MASK;
+ off &= ~IPS_UNCHANGEABLE_MASK;
+
+ for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+ if (on & (1 << bit))
+ set_bit(bit, &ct->status);
+ else if (off & (1 << bit))
+ clear_bit(bit, &ct->status);
+ }
+}
+EXPORT_SYMBOL_GPL(__nf_ct_change_status);
+
+int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
+{
+ unsigned long d;
+
+ d = ct->status ^ status;
+
+ if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
+ /* unchangeable */
+ return -EBUSY;
+
+ if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
+ /* SEEN_REPLY bit can only be set */
+ return -EBUSY;
+
+ if (d & IPS_ASSURED && !(status & IPS_ASSURED))
+ /* ASSURED bit can only be set */
+ return -EBUSY;
+
+ __nf_ct_change_status(ct, status, 0);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
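A short example of the transition rules enforced above: SEEN_REPLY and ASSURED may only ever be turned on, and EXPECTED/CONFIRMED/DYING may not change at all. The helper name is hypothetical; the calls are to the function defined above.

static int status_example_sketch(struct nf_conn *ct)
{
	int err;

	/* OK: turns IPS_ASSURED on (the bit can only be set). */
	err = nf_ct_change_status_common(ct, ct->status | IPS_ASSURED);
	if (err)
		return err;

	/* -EBUSY if IPS_SEEN_REPLY was set: the bit cannot be cleared. */
	return nf_ct_change_status_common(ct, ct->status & ~IPS_SEEN_REPLY);
}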
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 3d042f8ff183..81baf2082604 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */
/*
@@ -5,10 +6,6 @@
* (C) 2005 Patrick McHardy <kaber@trash.net>
* (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -19,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
-#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -27,11 +23,14 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-#define ECACHE_RETRY_WAIT (HZ/10)
+#define DYING_NULLS_VAL ((1 << 30) + 1)
+#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
+#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
@@ -39,138 +38,173 @@ enum retry_state {
STATE_DONE,
};
-static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
- struct nf_conn *refs[16];
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return &cnet->ecache;
+}
+#if IS_MODULE(CONFIG_NF_CT_NETLINK)
+EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
+#endif
+
+static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
+{
+ unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
+ struct hlist_nulls_head evicted_list;
+ enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int evicted = 0;
- enum retry_state ret = STATE_DONE;
+ unsigned int sent;
- spin_lock(&pcpu->lock);
-
- hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- struct nf_conntrack_ecache *e;
+ INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
- if (!nf_ct_is_confirmed(ct))
- continue;
+next:
+ sent = 0;
+ spin_lock_bh(&cnet->ecache.dying_lock);
- e = nf_ct_ecache_find(ct);
- if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
- continue;
+ hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ /* The worker owns all entries; ct remains valid until nf_ct_put
+ * in the loop below.
+ */
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
- e->state = NFCT_ECACHE_DESTROY_SENT;
- refs[evicted] = ct;
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
- if (++evicted >= ARRAY_SIZE(refs)) {
+ if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
+
+ if (sent++ > 16) {
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+ cond_resched();
+ goto next;
+ }
}
- spin_unlock(&pcpu->lock);
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+
+ hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+ nf_ct_put(ct);
- /* can't _put while holding lock */
- while (evicted)
- nf_ct_put(refs[--evicted]);
+ cond_resched();
+ }
return ret;
}
static void ecache_work(struct work_struct *work)
{
- struct netns_ct *ctnet =
- container_of(work, struct netns_ct, ecache_dwork.work);
- int cpu, delay = -1;
- struct ct_pcpu *pcpu;
+ struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
+ int ret, delay = -1;
+
+ ret = ecache_work_evict_list(cnet);
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_JIFFIES;
+ break;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
+ }
- local_bh_disable();
+ if (delay >= 0)
+ schedule_delayed_work(&cnet->ecache.dwork, delay);
+}
- for_each_possible_cpu(cpu) {
- enum retry_state ret;
+static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
+ const u32 events,
+ const u32 missed,
+ const struct nf_ct_event *item)
+{
+ struct net *net = nf_ct_net(item->ct);
+ struct nf_ct_event_notifier *notify;
+ u32 old, want;
+ int ret;
- pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
+ if (!((events | missed) & e->ctmask))
+ return 0;
- ret = ecache_work_evict_list(pcpu);
+ rcu_read_lock();
- switch (ret) {
- case STATE_CONGESTED:
- delay = ECACHE_RETRY_WAIT;
- goto out;
- case STATE_RESTART:
- delay = 0;
- break;
- case STATE_DONE:
- break;
- }
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
+ if (!notify) {
+ rcu_read_unlock();
+ return 0;
}
- out:
- local_bh_enable();
+ ret = notify->ct_event(events | missed, item);
+ rcu_read_unlock();
+
+ if (likely(ret >= 0 && missed == 0))
+ return 0;
- ctnet->ecache_dwork_pending = delay > 0;
- if (delay >= 0)
- schedule_delayed_work(&ctnet->ecache_dwork, delay);
+ do {
+ old = READ_ONCE(e->missed);
+ if (ret < 0)
+ want = old | events;
+ else
+ want = old & ~missed;
+ } while (cmpxchg(&e->missed, old, want) != old);
+
+ return ret;
}
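The cmpxchg() loop above replaces the old spin_lock_bh(&ct->lock) protection of e->missed with a lock-free read-modify-write: on contention it simply recomputes from the freshly observed value and retries. The pattern in isolation, as a sketch:

/* Lock-free retry pattern used above: recompute the new mask from the
 * latest observed value until the swap wins.
 */
static void update_missed_sketch(u32 *missed, u32 events, u32 resend, int ret)
{
	u32 old, want;

	do {
		old = READ_ONCE(*missed);
		if (ret < 0)
			want = old | events;	/* delivery failed: remember */
		else
			want = old & ~resend;	/* redelivery worked: forget */
	} while (cmpxchg(missed, old, want) != old);
}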
-int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (local64_read(&e->timestamp))
+ local64_set(&e->timestamp, ktime_get_real_ns());
+#endif
+}
+
+int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
- int ret = 0;
- struct net *net = nf_ct_net(ct);
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ unsigned int missed;
+ int ret;
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (!notify)
- goto out_unlock;
+ if (!nf_ct_is_confirmed(ct))
+ return 0;
e = nf_ct_ecache_find(ct);
if (!e)
- goto out_unlock;
+ return 0;
- if (nf_ct_is_confirmed(ct)) {
- struct nf_ct_event item = {
- .ct = ct,
- .portid = e->portid ? e->portid : portid,
- .report = report
- };
- /* This is a resent of a destroy event? If so, skip missed */
- unsigned long missed = e->portid ? 0 : e->missed;
-
- if (!((eventmask | missed) & e->ctmask))
- goto out_unlock;
-
- ret = notify->fcn(eventmask | missed, &item);
- if (unlikely(ret < 0 || missed)) {
- spin_lock_bh(&ct->lock);
- if (ret < 0) {
- /* This is a destroy event that has been
- * triggered by a process, we store the PORTID
- * to include it in the retransmission.
- */
- if (eventmask & (1 << IPCT_DESTROY)) {
- if (e->portid == 0 && portid != 0)
- e->portid = portid;
- e->state = NFCT_ECACHE_DESTROY_FAIL;
- } else {
- e->missed |= eventmask;
- }
- } else {
- e->missed &= ~missed;
- }
- spin_unlock_bh(&ct->lock);
- }
+ memset(&item, 0, sizeof(item));
+
+ item.ct = ct;
+ item.portid = e->portid ? e->portid : portid;
+ item.report = report;
+
+ /* Is this a resend of a destroy event? If so, skip missed */
+ missed = e->portid ? 0 : e->missed;
+
+ nf_ct_ecache_tstamp_refresh(e);
+
+ ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
+ if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
+ /* This is a destroy event that has been triggered by a process;
+ * we store the PORTID to include it in the retransmission.
+ */
+ if (e->portid == 0 && portid != 0)
+ e->portid = portid;
}
-out_unlock:
- rcu_read_unlock();
+
return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
@@ -179,53 +213,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
* disabled softirqs */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
- unsigned long events, missed;
- struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
struct nf_ct_event item;
- int ret;
+ unsigned int events;
- rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (notify == NULL)
- goto out_unlock;
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
+ return;
e = nf_ct_ecache_find(ct);
if (e == NULL)
- goto out_unlock;
+ return;
events = xchg(&e->cache, 0);
- if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
- goto out_unlock;
-
- /* We make a copy of the missed event cache without taking
- * the lock, thus we may send missed events twice. However,
- * this does not harm and it happens very rarely. */
- missed = e->missed;
-
- if (!((events | missed) & e->ctmask))
- goto out_unlock;
-
item.ct = ct;
item.portid = 0;
item.report = 0;
- ret = notify->fcn(events | missed, &item);
-
- if (likely(ret == 0 && !missed))
- goto out_unlock;
-
- spin_lock_bh(&ct->lock);
- if (ret < 0)
- e->missed |= events;
- else
- e->missed &= ~missed;
- spin_unlock_bh(&ct->lock);
-
-out_unlock:
- rcu_read_unlock();
+ /* We make a copy of the missed event cache without taking
+ * the lock; thus we may send missed events twice. However,
+ * this is harmless and happens very rarely.
+ */
+ __nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
@@ -235,11 +244,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
{
struct net *net = nf_ct_exp_net(exp);
- struct nf_exp_event_notifier *notify;
+ struct nf_ct_event_notifier *notify;
struct nf_conntrack_ecache *e;
rcu_read_lock();
- notify = rcu_dereference(net->ct.nf_expect_event_cb);
+ notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
if (!notify)
goto out_unlock;
@@ -253,118 +262,120 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
.portid = portid,
.report = report
};
- notify->fcn(1 << event, &item);
+ notify->exp_event(1 << event, &item);
}
out_unlock:
rcu_read_unlock();
}
-int nf_conntrack_register_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_register_notifier(struct net *net,
+ const struct nf_ct_event_notifier *new)
{
- int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ WARN_ON_ONCE(notify);
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
- ret = 0;
-
-out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
-void nf_conntrack_unregister_notifier(struct net *net,
- struct nf_ct_event_notifier *new)
+void nf_conntrack_unregister_notifier(struct net *net)
{
- struct nf_ct_event_notifier *notify;
-
mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ /* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
-int nf_ct_expect_register_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
- int ret;
- struct nf_exp_event_notifier *notify;
-
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- if (notify != NULL) {
- ret = -EBUSY;
- goto out_unlock;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ if (state == NFCT_ECACHE_DESTROY_FAIL &&
+ !delayed_work_pending(&cnet->ecache.dwork)) {
+ schedule_delayed_work(&cnet->ecache.dwork, HZ);
+ net->ct.ecache_dwork_pending = true;
+ } else if (state == NFCT_ECACHE_DESTROY_SENT) {
+ if (!hlist_nulls_empty(&cnet->ecache.dying_list))
+ mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0);
+ else
+ net->ct.ecache_dwork_pending = false;
}
- rcu_assign_pointer(net->ct.nf_expect_event_cb, new);
- ret = 0;
-
-out_unlock:
- mutex_unlock(&nf_ct_ecache_mutex);
- return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
-void nf_ct_expect_unregister_notifier(struct net *net,
- struct nf_exp_event_notifier *new)
+static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
{
- struct nf_exp_event_notifier *notify;
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ u64 ts = 0;
- mutex_lock(&nf_ct_ecache_mutex);
- notify = rcu_dereference_protected(net->ct.nf_expect_event_cb,
- lockdep_is_held(&nf_ct_ecache_mutex));
- BUG_ON(notify != new);
- RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL);
- mutex_unlock(&nf_ct_ecache_mutex);
- /* synchronize_rcu() is called from ctnetlink_exit. */
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ ts = ktime_get_real_ns();
+
+ local64_set(&e->timestamp, ts);
+#endif
}
-EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
-#define NF_CT_EVENTS_DEFAULT 1
-static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *e;
-static const struct nf_ct_ext_type event_extend = {
- .len = sizeof(struct nf_conntrack_ecache),
- .align = __alignof__(struct nf_conntrack_ecache),
- .id = NF_CT_EXT_ECACHE,
-};
+ switch (net->ct.sysctl_events) {
+ case 0:
+ /* assignment via template / ruleset? ignore sysctl. */
+ if (ctmask || expmask)
+ break;
+ return true;
+ case 2: /* autodetect: no event listener, don't allocate extension. */
+ if (!READ_ONCE(nf_ctnetlink_has_listener))
+ return true;
+ fallthrough;
+ case 1:
+ /* always allocate an extension. */
+ if (!ctmask && !expmask) {
+ ctmask = ~0;
+ expmask = ~0;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return true;
+ }
-void nf_conntrack_ecache_pernet_init(struct net *net)
-{
- net->ct.sysctl_events = nf_ct_events;
- INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work);
-}
+ e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
+ if (e) {
+ nf_ct_ecache_tstamp_new(ct, e);
+ e->ctmask = ctmask;
+ e->expmask = expmask;
+ }
-void nf_conntrack_ecache_pernet_fini(struct net *net)
-{
- cancel_delayed_work_sync(&net->ct.ecache_dwork);
+ return e != NULL;
}
+EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
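In summary, the sysctl_events switch above implements three policies; value 2 becomes the new default just below:

/* net.netfilter.nf_conntrack_events semantics as implemented above:
 *
 *   0: no extension unless a template/ruleset requested one via
 *	ctmask/expmask (the sysctl is bypassed for explicit requests);
 *   1: always allocate, defaulting both masks to ~0;
 *   2: like 1, but skip the allocation entirely while no ctnetlink
 *	listener is registered (nf_ctnetlink_has_listener).
 */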
-int nf_conntrack_ecache_init(void)
+#define NF_CT_EVENTS_DEFAULT 2
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+
+void nf_conntrack_ecache_pernet_init(struct net *net)
{
- int ret = nf_ct_extend_register(&event_extend);
- if (ret < 0)
- pr_err("Unable to register event extension\n");
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ net->ct.sysctl_events = nf_ct_events;
- BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */
+ INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
+ INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
+ spin_lock_init(&cnet->ecache.dying_lock);
- return ret;
+ BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
-void nf_conntrack_ecache_fini(void)
+void nf_conntrack_ecache_pernet_fini(struct net *net)
{
- nf_ct_extend_unregister(&event_extend);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ cancel_delayed_work_sync(&cnet->ecache.dwork);
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 3034038bfdf0..cfc2daa3fc7f 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Expectation handling for nf_conntrack. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
* (c) 2005-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -20,7 +17,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
@@ -28,8 +25,10 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>
@@ -42,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
-static unsigned int nf_ct_expect_hashrnd __read_mostly;
+static siphash_aligned_key_t nf_ct_expect_hashrnd;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@@ -50,12 +49,15 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
+ struct nf_conntrack_net *cnet;
WARN_ON(!master_help);
WARN_ON(timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
- net->ct.expect_count--;
+
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count--;
hlist_del_rcu(&exp->lnode);
master_help->expecting[exp->class]--;
@@ -69,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
static void nf_ct_expectation_timed_out(struct timer_list *t)
{
- struct nf_conntrack_expect *exp = from_timer(exp, t, timeout);
+ struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout);
spin_lock_bh(&nf_conntrack_expect_lock);
nf_ct_unlink_expect(exp);
@@ -79,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t)
static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- unsigned int hash, seed;
+ struct {
+ union nf_inet_addr dst_addr;
+ u32 net_mix;
+ u16 dport;
+ u8 l3num;
+ u8 protonum;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
+ u32 hash;
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
- seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
+ memset(&combined, 0, sizeof(combined));
+
+ combined.dst_addr = tuple->dst.u3;
+ combined.net_mix = net_hash_mix(n);
+ combined.dport = (__force __u16)tuple->dst.u.all;
+ combined.l3num = tuple->src.l3num;
+ combined.protonum = tuple->dst.protonum;
- hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
- (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
- (__force __u16)tuple->dst.u.all) ^ seed);
+ hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
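Two details above are easy to miss: the key struct must be SIPHASH_ALIGNMENT-aligned, and it must be zeroed before the fields are filled in, because siphash() consumes raw bytes and the u16/u8 tail leaves implicit padding. A minimal template for hashing a new key type the same way (all names hypothetical):

struct sketch_key {
	u32 addr;
	u16 port;
	u8  l3num;
	/* one implicit padding byte; must be zeroed before hashing */
} __aligned(SIPHASH_ALIGNMENT);

static u32 sketch_key_hash(u32 addr, u16 port, u8 l3num,
			   const siphash_key_t *key)
{
	struct sketch_key k;

	memset(&k, 0, sizeof(k));	/* covers the padding byte */
	k.addr = addr;
	k.port = port;
	k.l3num = l3num;

	return (u32)siphash(&k, sizeof(k), key);
}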
@@ -105,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
bool nf_ct_remove_expect(struct nf_conntrack_expect *exp)
{
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
return true;
@@ -119,10 +132,11 @@ __nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -157,12 +171,13 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net,
const struct nf_conntrack_zone *zone,
- const struct nf_conntrack_tuple *tuple)
+ const struct nf_conntrack_tuple *tuple, bool unlink)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
struct nf_conntrack_expect *i, *exp = NULL;
unsigned int h;
- if (!net->ct.expect_count)
+ if (!cnet->expect_count)
return NULL;
h = nf_ct_expect_dst_hash(net, tuple);
@@ -188,22 +203,22 @@ nf_ct_find_expectation(struct net *net,
* about to invoke ->destroy(), or nf_ct_delete() via timeout
* or early_drop().
*
- * The atomic_inc_not_zero() check tells: If that fails, we
+ * The refcount_inc_not_zero() check tells us: if that fails, we
* know that the ct is being destroyed. If it succeeds, we
* can be sure the ct cannot disappear underneath.
*/
if (unlikely(nf_ct_is_dying(exp->master) ||
- !atomic_inc_not_zero(&exp->master->ct_general.use)))
+ !refcount_inc_not_zero(&exp->master->ct_general.use)))
return NULL;
- if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) {
refcount_inc(&exp->use);
return exp;
- } else if (del_timer(&exp->timeout)) {
+ } else if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
return exp;
}
- /* Undo exp->master refcnt increase, if del_timer() failed */
+ /* Undo exp->master refcnt increase, if timer_delete() failed */
nf_ct_put(exp->master);
return NULL;
@@ -252,13 +267,22 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
static inline int expect_matches(const struct nf_conntrack_expect *a,
const struct nf_conntrack_expect *b)
{
- return a->master == b->master &&
- nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+ return nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
+static bool master_matches(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b,
+ unsigned int flags)
+{
+ if (flags & NF_CT_EXP_F_SKIP_MASTER)
+ return true;
+
+ return a->master == b->master;
+}
+
/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
@@ -336,7 +360,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
exp->tuple.dst.u.all = *dst;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
#endif
@@ -360,6 +384,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put);
static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
+ struct nf_conntrack_net *cnet;
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
@@ -381,7 +406,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
- net->ct.expect_count++;
+ cnet = nf_ct_pernet(net);
+ cnet->expect_count++;
NF_CT_STAT_INC(net, expect_create);
}
@@ -402,10 +428,12 @@ static void evict_oldest_expect(struct nf_conn *master,
nf_ct_remove_expect(last);
}
-static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
+static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+ unsigned int flags)
{
const struct nf_conntrack_expect_policy *p;
struct nf_conntrack_expect *i;
+ struct nf_conntrack_net *cnet;
struct nf_conn *master = expect->master;
struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_helper *helper;
@@ -420,8 +448,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
h = nf_ct_expect_dst_hash(net, &expect->tuple);
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
- if (expect_matches(i, expect)) {
- if (i->class != expect->class)
+ if (master_matches(i, expect, flags) &&
+ expect_matches(i, expect)) {
+ if (i->class != expect->class ||
+ i->master != expect->master)
return -EALREADY;
if (nf_ct_remove_expect(i))
@@ -447,7 +477,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
}
- if (net->ct.expect_count >= nf_ct_expect_max) {
+ cnet = nf_ct_pernet(net);
+ if (cnet->expect_count >= nf_ct_expect_max) {
net_warn_ratelimited("nf_conntrack: expectation table full\n");
ret = -EMFILE;
}
@@ -456,12 +487,12 @@ out:
}
int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
- u32 portid, int report)
+ u32 portid, int report, unsigned int flags)
{
int ret;
spin_lock_bh(&nf_conntrack_expect_lock);
- ret = __nf_ct_expect_check(expect);
+ ret = __nf_ct_expect_check(expect, flags);
if (ret < 0)
goto out;
@@ -489,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo
hlist_for_each_entry_safe(exp, next,
&nf_ct_expect_hash[i],
hnode) {
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
}
@@ -519,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net,
if (!net_eq(nf_ct_exp_net(exp), net))
continue;
- if (iter(exp, data) && del_timer(&exp->timeout)) {
+ if (iter(exp, data) && timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, portid, report);
nf_ct_expect_put(exp);
}
@@ -610,7 +641,7 @@ static int exp_seq_show(struct seq_file *s, void *v)
expect->tuple.src.l3num,
expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- __nf_ct_l4proto_find(expect->tuple.dst.protonum));
+ nf_ct_l4proto_find(expect->tuple.dst.protonum));
if (expect->flags & NF_CT_EXPECT_PERMANENT) {
seq_puts(s, "PERMANENT");
@@ -675,7 +706,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
- net->ct.expect_count = 0;
return exp_proc_init(net);
}
@@ -692,9 +722,7 @@ int nf_conntrack_expect_init(void)
nf_ct_expect_hsize = 1;
}
nf_ct_expect_max = nf_ct_expect_hsize * 4;
- nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
- sizeof(struct nf_conntrack_expect),
- 0, 0, NULL);
+ nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0);
if (!nf_ct_expect_cachep)
return -ENOMEM;
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 277bbfe26478..dd62cc12e775 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Structure dynamic extension infrastructure
* Copyright (C) 2004 Rusty Russell IBM Corporation
* Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org>
* Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/kmemleak.h>
@@ -17,42 +13,96 @@
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack_extend.h>
-static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
-static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_timestamp.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_act_ct.h>
+#include <net/netfilter/nf_nat.h>
+
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
-void nf_ct_ext_destroy(struct nf_conn *ct)
+atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
+
+static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
+ [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
+#if IS_ENABLED(CONFIG_NF_NAT)
+ [NF_CT_EXT_NAT] = sizeof(struct nf_conn_nat),
+#endif
+ [NF_CT_EXT_SEQADJ] = sizeof(struct nf_conn_seqadj),
+ [NF_CT_EXT_ACCT] = sizeof(struct nf_conn_acct),
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ [NF_CT_EXT_ECACHE] = sizeof(struct nf_conntrack_ecache),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ [NF_CT_EXT_TSTAMP] = sizeof(struct nf_conn_tstamp),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ [NF_CT_EXT_TIMEOUT] = sizeof(struct nf_conn_timeout),
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ [NF_CT_EXT_LABELS] = sizeof(struct nf_conn_labels),
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ [NF_CT_EXT_SYNPROXY] = sizeof(struct nf_conn_synproxy),
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ [NF_CT_EXT_ACT_CT] = sizeof(struct nf_conn_act_ct_ext),
+#endif
+};
+
+static __always_inline unsigned int total_extension_size(void)
{
- unsigned int i;
- struct nf_ct_ext_type *t;
-
- for (i = 0; i < NF_CT_EXT_NUM; i++) {
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[i]);
-
- /* Here the nf_ct_ext_type might have been unregistered.
- * I.e., it is responsible for cleaning up the private
- * area in all conntracks when it is unregistered.
- */
- if (t && t->destroy)
- t->destroy(ct);
- rcu_read_unlock();
- }
+ /* remember to add new extensions below */
+ BUILD_BUG_ON(NF_CT_EXT_NUM > 10);
+
+ return sizeof(struct nf_ct_ext) +
+ sizeof(struct nf_conn_help)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ + sizeof(struct nf_conn_nat)
+#endif
+ + sizeof(struct nf_conn_seqadj)
+ + sizeof(struct nf_conn_acct)
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ + sizeof(struct nf_conntrack_ecache)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + sizeof(struct nf_conn_tstamp)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ + sizeof(struct nf_conn_timeout)
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ + sizeof(struct nf_conn_labels)
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+ + sizeof(struct nf_conn_synproxy)
+#endif
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+ + sizeof(struct nf_conn_act_ct_ext)
+#endif
+ ;
}
-EXPORT_SYMBOL(nf_ct_ext_destroy);
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
{
unsigned int newlen, newoff, oldlen, alloc;
- struct nf_ct_ext *old, *new;
- struct nf_ct_ext_type *t;
+ struct nf_ct_ext *new;
/* Conntrack must not be confirmed to avoid races on reallocation. */
WARN_ON(nf_ct_is_confirmed(ct));
- old = ct->ext;
+ /* struct nf_ct_ext uses u8 to store offsets/size */
+ BUILD_BUG_ON(total_extension_size() > 255u);
+
+ if (ct->ext) {
+ const struct nf_ct_ext *old = ct->ext;
- if (old) {
if (__nf_ct_ext_exist(old, id))
return NULL;
oldlen = old->len;
@@ -60,62 +110,50 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
oldlen = sizeof(*new);
}
- rcu_read_lock();
- t = rcu_dereference(nf_ct_ext_types[id]);
- if (!t) {
- rcu_read_unlock();
- return NULL;
- }
-
- newoff = ALIGN(oldlen, t->align);
- newlen = newoff + t->len;
- rcu_read_unlock();
+ newoff = ALIGN(oldlen, __alignof__(struct nf_ct_ext));
+ newlen = newoff + nf_ct_ext_type_len[id];
alloc = max(newlen, NF_CT_EXT_PREALLOC);
- kmemleak_not_leak(old);
- new = __krealloc(old, alloc, gfp);
+ new = krealloc(ct->ext, alloc, gfp);
if (!new)
return NULL;
- if (!old) {
+ if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
- ct->ext = new;
- } else if (new != old) {
- kfree_rcu(old, rcu);
- rcu_assign_pointer(ct->ext, new);
+ new->gen_id = atomic_read(&nf_conntrack_ext_genid);
}
new->offset[id] = newoff;
new->len = newlen;
memset((void *)new + newoff, 0, newlen - newoff);
+
+ ct->ext = new;
return (void *)new + newoff;
}
EXPORT_SYMBOL(nf_ct_ext_add);
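Callers rarely invoke nf_ct_ext_add() directly; each extension has a thin typed wrapper. A simplified sketch of the accounting one (the real wrapper in nf_conntrack_acct.h also consults the per-netns acct sysctl):

	/* Allocate (and zero) the accounting area on first use; per the
	 * WARN_ON above, this must happen before the conntrack is confirmed. */
	static inline struct nf_conn_acct *
	nf_ct_acct_ext_add(struct nf_conn *ct, gfp_t gfp)
	{
		return nf_ct_ext_add(ct, NF_CT_EXT_ACCT, gfp);
	}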
-/* This MUST be called in process context. */
-int nf_ct_extend_register(const struct nf_ct_ext_type *type)
+/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
{
- int ret = 0;
+ unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
+ unsigned int this_id = READ_ONCE(ext->gen_id);
- mutex_lock(&nf_ct_ext_type_mutex);
- if (nf_ct_ext_types[type->id]) {
- ret = -EBUSY;
- goto out;
- }
+ if (!__nf_ct_ext_exist(ext, id))
+ return NULL;
- rcu_assign_pointer(nf_ct_ext_types[type->id], type);
-out:
- mutex_unlock(&nf_ct_ext_type_mutex);
- return ret;
+ if (this_id == 0 || ext->gen_id == gen_id)
+ return (void *)ext + ext->offset[id];
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_register);
+EXPORT_SYMBOL(__nf_ct_ext_find);
-/* This MUST be called in process context. */
-void nf_ct_extend_unregister(const struct nf_ct_ext_type *type)
+void nf_ct_ext_bump_genid(void)
{
- mutex_lock(&nf_ct_ext_type_mutex);
- RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL);
- mutex_unlock(&nf_ct_ext_type_mutex);
- synchronize_rcu();
+ unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
+
+ if (value == UINT_MAX)
+ atomic_set(&nf_conntrack_ext_genid, 1);
+
+ msleep(HZ);
}
-EXPORT_SYMBOL_GPL(nf_ct_extend_unregister);
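The generation id is what makes the register/unregister machinery above removable: each extension area is stamped with the genid current at allocation, and __nf_ct_ext_find() refuses to hand out data from an older generation. A hedged sketch of the writer side (the module name is hypothetical):

	/* Hypothetical teardown path: invalidate all existing extension
	 * areas in one step instead of walking every conntrack. */
	static void __exit example_helper_exit(void)
	{
		/* stale areas now fail __nf_ct_ext_find(); the msleep(HZ)
		 * inside gives in-flight readers time to drain */
		nf_ct_ext_bump_genid();
	}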
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index a11c304fb771..617f744a2e3a 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* FTP extension for connection tracking. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -29,15 +26,13 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <linux/netfilter/nf_conntrack_ftp.h>
+#define HELPER_NAME "ftp"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp connection tracking helper");
MODULE_ALIAS("ip_conntrack_ftp");
-MODULE_ALIAS_NFCT_HELPER("ftp");
-
-/* This is slow, but it's simple. --RR */
-static char *ftp_buffer;
-
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static DEFINE_SPINLOCK(nf_ftp_lock);
#define MAX_PORTS 8
@@ -163,7 +158,7 @@ static int try_rfc959(const char *data, size_t dlen,
if (length == 0)
return 0;
- cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
+ cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) |
(array[2] << 8) | array[3]);
cmd->u.tcp.port = htons((array[4] << 8) | array[5]);
return length;
@@ -323,7 +318,7 @@ static int find_pattern(const char *data, size_t dlen,
i++;
}
- pr_debug("Skipped up to `%c'!\n", skip);
+ pr_debug("Skipped up to 0x%hhx delimiter!\n", skip);
*numoff = i;
*numlen = getnum(data + i, dlen - i, cmd, term, numoff);
@@ -383,7 +378,7 @@ static int help(struct sk_buff *skb,
int ret;
u32 seq;
int dir = CTINFO2DIR(ctinfo);
- unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
+ unsigned int matchlen, matchoff;
struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
union nf_inet_addr *daddr;
@@ -399,6 +394,9 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
}
+ if (unlikely(skb_linearize(skb)))
+ return NF_DROP;
+
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
return NF_ACCEPT;
@@ -412,9 +410,9 @@ static int help(struct sk_buff *skb,
}
datalen = skb->len - dataoff;
+ /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */
spin_lock_bh(&nf_ftp_lock);
- fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
- BUG_ON(fb_ptr == NULL);
+ fb_ptr = skb->data + dataoff;
ends_in_nl = (fb_ptr[datalen - 1] == '\n');
seq = ntohl(th->seq) + datalen;
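Linearizing the skb up front is what allows the static 64 KiB ftp_buffer to go away below: once skb_linearize() succeeds, the entire payload sits in the head buffer and can be parsed in place. The pattern, condensed from this patch:

	if (unlikely(skb_linearize(skb)))
		return NF_DROP;           /* allocation failed: don't mis-parse */
	/* ... */
	fb_ptr = skb->data + dataoff;     /* valid for skb->len - dataoff bytes */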
@@ -526,7 +524,7 @@ skip_nl_seq:
protoff, matchoff, matchlen, exp);
else {
/* Can't expect this? Best to drop packet now. */
- if (nf_ct_expect_related(exp) != 0) {
+ if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
} else
@@ -569,7 +567,6 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = {
static void __exit nf_conntrack_ftp_fini(void)
{
nf_conntrack_helpers_unregister(ftp, ports_c * 2);
- kfree(ftp_buffer);
}
static int __init nf_conntrack_ftp_init(void)
@@ -578,28 +575,25 @@ static int __init nf_conntrack_ftp_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master));
- ftp_buffer = kmalloc(65536, GFP_KERNEL);
- if (!ftp_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = FTP_PORT;
/* FIXME should be configurable whether IPv4 and IPv6 FTP connections
are tracked or not - YK */
for (i = 0; i < ports_c; i++) {
- nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp",
- FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
- 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE);
- nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp",
- FTP_PORT, ports[i], ports[i], &ftp_exp_policy,
- 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE);
+ nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP,
+ HELPER_NAME, FTP_PORT, ports[i], ports[i],
+ &ftp_exp_policy, 0, help,
+ nf_ct_ftp_from_nlattr, THIS_MODULE);
+ nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP,
+ HELPER_NAME, FTP_PORT, ports[i], ports[i],
+ &ftp_exp_policy, 0, help,
+ nf_ct_ftp_from_nlattr, THIS_MODULE);
}
ret = nf_conntrack_helpers_register(ftp, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(ftp_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c
index 1601275efe2d..540d97715bd2 100644
--- a/net/netfilter/nf_conntrack_h323_asn1.c
+++ b/net/netfilter/nf_conntrack_h323_asn1.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323
- * conntrack/NAT module.
+ * BER and PER decoding library for H.323 conntrack/NAT module.
*
* Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net>
*
- * This source code is licensed under General Public License version 2.
- *
- * See ip_conntrack_helper_h323_asn1.h for details.
- *
+ * See nf_conntrack_helper_h323_asn1.h for details.
*/
#ifdef __KERNEL__
@@ -172,7 +169,7 @@ static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits)
if (bits % BITS_PER_BYTE > 0)
bytes++;
- if (*bs->cur + bytes > *bs->end)
+ if (bs->cur + bytes > bs->end)
return 1;
return 0;
@@ -260,15 +257,15 @@ static unsigned int get_uint(struct bitstr *bs, int b)
case 4:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 3:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 2:
v |= *bs->cur++;
v <<= 8;
- /* fall through */
+ fallthrough;
case 1:
v |= *bs->cur++;
break;
@@ -536,6 +533,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
/* Get fields bitmap */
if (nf_h323_error_boundary(bs, 0, f->sz))
return H323_ERROR_BOUND;
+ if (f->sz > 32)
+ return H323_ERROR_RANGE;
bmp = get_bitmap(bs, f->sz);
if (base)
*(unsigned int *)base = bmp;
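Both new range checks cap the field size fed to get_bitmap(), which accumulates into a 32-bit word; a declared size above 32 would over-shift the accumulator, which is undefined behavior in C. The guarded shape:

	/* get_bitmap() fills a u32; reject sizes it cannot hold. */
	if (f->sz > 32)
		return H323_ERROR_RANGE;
	bmp = get_bitmap(bs, f->sz);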
@@ -592,6 +591,8 @@ static int decode_seq(struct bitstr *bs, const struct field_t *f,
bmp2_len = get_bits(bs, 7) + 1;
if (nf_h323_error_boundary(bs, 0, bmp2_len))
return H323_ERROR_BOUND;
+ if (bmp2_len > 32)
+ return H323_ERROR_RANGE;
bmp2 = get_bitmap(bs, bmp2_len);
bmp |= bmp2 >> f->sz;
if (base)
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 005589c6d0f6..14f73872f647 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1,13 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* H.323 connection tracking helper
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
- * This source code is licensed under General Public License version 2.
- *
* Based on the 'brute force' H.323 connection tracking module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*
* For more information, please see http://nath323.sourceforge.net/
*/
@@ -35,6 +34,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_h323.h>
+#define H323_MAX_SIZE 65535
+
/* Parameters */
static unsigned int default_rrq_ttl __read_mostly = 300;
module_param(default_rrq_ttl, uint, 0600);
@@ -50,64 +51,8 @@ MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations "
"if both endpoints are on different sides "
"(determined by routing information)");
-/* Hooks for NAT */
-int (*set_h245_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_h225_addr_hook) (struct sk_buff *skb, unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr,
- union nf_inet_addr *addr, __be16 port)
- __read_mostly;
-int (*set_sig_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*set_ras_addr_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff, unsigned char **data,
- TransportAddress *taddr, int count) __read_mostly;
-int (*nat_rtp_rtcp_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr,
- __be16 port, __be16 rtp_port,
- struct nf_conntrack_expect *rtp_exp,
- struct nf_conntrack_expect *rtcp_exp) __read_mostly;
-int (*nat_t120_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- H245_TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_h245_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_callforwarding_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, int dataoff,
- TransportAddress *taddr, __be16 port,
- struct nf_conntrack_expect *exp) __read_mostly;
-int (*nat_q931_hook) (struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int protoff,
- unsigned char **data, TransportAddress *taddr, int idx,
- __be16 port, struct nf_conntrack_expect *exp)
- __read_mostly;
+const struct nfct_h323_nat_hooks __rcu *nfct_h323_nat_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nfct_h323_nat_hook);
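The ten standalone NAT hook pointers above collapse into one RCU-managed struct: the NAT module publishes a single pointer and conntrack dereferences it once per message. A sketch of its shape (full prototypes live in linux/netfilter/nf_conntrack_h323.h; parameter lists abbreviated here):

	struct nfct_h323_nat_hooks {
		int (*set_h245_addr)();         /* each member keeps the  */
		int (*set_h225_addr)();         /* signature of the hook  */
		int (*set_sig_addr)();          /* it replaces            */
		int (*set_ras_addr)();
		int (*nat_rtp_rtcp)();
		int (*nat_t120)();
		int (*nat_h245)();
		int (*nat_callforwarding)();
		int (*nat_q931)();
	};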
static DEFINE_SPINLOCK(nf_h323_lock);
static char *h323_buffer;
@@ -143,11 +88,15 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen <= 0) /* No TCP data */
goto clear_out;
+ if (tcpdatalen > H323_MAX_SIZE)
+ tcpdatalen = H323_MAX_SIZE;
+
if (*data == NULL) { /* first TPKT */
/* Get first TPKT pointer */
tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen,
h323_buffer);
- BUG_ON(tpkt == NULL);
+ if (!tpkt)
+ goto clear_out;
/* Validate TPKT identifier */
if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) {
@@ -194,7 +143,7 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
if (tcpdatalen == 4) { /* Separate TPKT header */
/* Netmeeting sends TPKT header and data separately */
pr_debug("nf_ct_h323: separate TPKT header indicates "
- "there will be TPKT data of %hu bytes\n",
+ "there will be TPKT data of %d bytes\n",
tpktlen - 4);
info->tpkt_len[dir] = tpktlen - 4;
return 0;
@@ -259,6 +208,7 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
@@ -266,7 +216,6 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
union nf_inet_addr addr;
struct nf_conntrack_expect *rtp_exp;
struct nf_conntrack_expect *rtcp_exp;
- typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp;
/* Read RTP or RTCP address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -296,18 +245,19 @@ static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
&ct->tuplehash[!dir].tuple.dst.u3,
IPPROTO_UDP, NULL, &rtcp_port);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
- taddr, port, rtp_port, rtp_exp, rtcp_exp);
+ ret = nathook->nat_rtp_rtcp(skb, ct, ctinfo, protoff, data, dataoff,
+ taddr, port, rtp_port, rtp_exp, rtcp_exp);
} else { /* Conntrack only */
- if (nf_ct_expect_related(rtp_exp) == 0) {
- if (nf_ct_expect_related(rtcp_exp) == 0) {
+ if (nf_ct_expect_related(rtp_exp, 0) == 0) {
+ if (nf_ct_expect_related(rtcp_exp, 0) == 0) {
pr_debug("nf_ct_h323: expect RTP ");
nf_ct_dump_tuple(&rtp_exp->tuple);
pr_debug("nf_ct_h323: expect RTCP ");
@@ -333,12 +283,12 @@ static int expect_t120(struct sk_buff *skb,
unsigned char **data, int dataoff,
H245_TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_t120_hook) nat_t120;
/* Read T.120 address */
if (!get_h245_addr(ct, *data, taddr, &addr, &port) ||
@@ -355,17 +305,18 @@ static int expect_t120(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_t120 = rcu_dereference(nat_t120_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_t120(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_t120(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_h323: expect T.120 ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -664,18 +615,19 @@ int get_h225_addr(struct nf_conn *ct, unsigned char *data,
return 1;
}
+EXPORT_SYMBOL_GPL(get_h225_addr);
static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int protoff, unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_h245_hook) nat_h245;
/* Read h245Address */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) ||
@@ -692,17 +644,18 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct,
IPPROTO_TCP, NULL, &port);
exp->helper = &nf_conntrack_helper_h245;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_h245 = rcu_dereference(nat_h245_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* NAT needed */
- ret = nat_h245(skb, ct, ctinfo, protoff, data, dataoff, taddr,
- port, exp);
+ ret = nathook->nat_h245(skb, ct, ctinfo, protoff, data,
+ dataoff, taddr, port, exp);
} else { /* Conntrack only */
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect H.245 ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -748,24 +701,19 @@ static int callforward_do_filter(struct net *net,
}
break;
}
-#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6: {
- const struct nf_ipv6_ops *v6ops;
struct rt6_info *rt1, *rt2;
struct flowi6 fl1, fl2;
- v6ops = nf_get_ipv6_ops();
- if (!v6ops)
- return 0;
-
memset(&fl1, 0, sizeof(fl1));
fl1.daddr = src->in6;
memset(&fl2, 0, sizeof(fl2));
fl2.daddr = dst->in6;
- if (!v6ops->route(net, (struct dst_entry **)&rt1,
+ if (!nf_ip6_route(net, (struct dst_entry **)&rt1,
flowi6_to_flowi(&fl1), false)) {
- if (!v6ops->route(net, (struct dst_entry **)&rt2,
+ if (!nf_ip6_route(net, (struct dst_entry **)&rt2,
flowi6_to_flowi(&fl2), false)) {
if (ipv6_addr_equal(rt6_nexthop(rt1, &fl1.daddr),
rt6_nexthop(rt2, &fl2.daddr)) &&
@@ -790,13 +738,13 @@ static int expect_callforwarding(struct sk_buff *skb,
unsigned char **data, int dataoff,
TransportAddress *taddr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
struct net *net = nf_ct_net(ct);
- typeof(nat_callforwarding_hook) nat_callforwarding;
/* Read alternativeAddress */
if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0)
@@ -820,18 +768,19 @@ static int expect_callforwarding(struct sk_buff *skb,
IPPROTO_TCP, NULL, &port);
exp->helper = nf_conntrack_helper_q931;
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if (memcmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3,
sizeof(ct->tuplehash[dir].tuple.src.u3)) &&
- (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) &&
+ nathook &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Need NAT */
- ret = nat_callforwarding(skb, ct, ctinfo,
- protoff, data, dataoff,
- taddr, port, exp);
+ ret = nathook->nat_callforwarding(skb, ct, ctinfo,
+ protoff, data, dataoff,
+ taddr, port, exp);
} else { /* Conntrack only */
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_q931: expect Call Forwarding ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -849,12 +798,12 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, int dataoff,
Setup_UUIE *setup)
{
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
int i;
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_q931: Setup\n");
@@ -865,9 +814,9 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
return -1;
}
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
if ((setup->options & eSetup_UUIE_destCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->destCallSignalAddress,
&addr, &port) &&
@@ -875,16 +824,16 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3,
ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.src.u3,
- ct->tuplehash[!dir].tuple.src.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ ct->tuplehash[!dir].tuple.src.u.tcp.port);
if (ret < 0)
return -1;
}
if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) &&
- (set_h225_addr) && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK &&
get_h225_addr(ct, *data, &setup->sourceCallSignalAddress,
&addr, &port) &&
@@ -892,10 +841,10 @@ static int process_setup(struct sk_buff *skb, struct nf_conn *ct,
pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n",
&addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3,
ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port));
- ret = set_h225_addr(skb, protoff, data, dataoff,
- &setup->sourceCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- ct->tuplehash[!dir].tuple.dst.u.tcp.port);
+ ret = nathook->set_h225_addr(skb, protoff, data, dataoff,
+ &setup->sourceCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ ct->tuplehash[!dir].tuple.dst.u.tcp.port);
if (ret < 0)
return -1;
}
@@ -1225,6 +1174,9 @@ static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NULL;
*datalen = skb->len - dataoff;
+ if (*datalen > H323_MAX_SIZE)
+ *datalen = H323_MAX_SIZE;
+
return skb_header_pointer(skb, dataoff, *datalen, h323_buffer);
}
@@ -1254,13 +1206,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
TransportAddress *taddr, int count)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret = 0;
int i;
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(nat_q931_hook) nat_q931;
/* Look for the first related address */
for (i = 0; i < count; i++) {
@@ -1284,13 +1236,13 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
exp->helper = nf_conntrack_helper_q931;
exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */
- nat_q931 = rcu_dereference(nat_q931_hook);
- if (nat_q931 && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) { /* Need NAT */
- ret = nat_q931(skb, ct, ctinfo, protoff, data,
- taddr, i, port, exp);
+ ret = nathook->nat_q931(skb, ct, ctinfo, protoff, data,
+ taddr, i, port, exp);
} else { /* Conntrack only */
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
nf_ct_dump_tuple(&exp->tuple);
@@ -1310,15 +1262,15 @@ static int process_grq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, GatekeeperRequest *grq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: GRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) /* NATed */
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &grq->rasAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &grq->rasAddress, 1);
return 0;
}
@@ -1355,7 +1307,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct,
IPPROTO_UDP, NULL, &port);
exp->helper = nf_conntrack_helper_ras;
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect RAS ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -1372,8 +1324,8 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationRequest *rrq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
pr_debug("nf_ct_ras: RRQ\n");
@@ -1383,12 +1335,12 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
if (ret < 0)
return -1;
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- rrq->rasAddress.item,
- rrq->rasAddress.count);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ rrq->rasAddress.item,
+ rrq->rasAddress.count);
if (ret < 0)
return -1;
}
@@ -1408,19 +1360,19 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, RegistrationConfirm *rcf)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: RCF\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- rcf->callSignalAddress.item,
- rcf->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ rcf->callSignalAddress.item,
+ rcf->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1433,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
if (info->timeout > 0) {
pr_debug("nf_ct_ras: set RAS connection timeout to "
"%u seconds\n", info->timeout);
- nf_ct_refresh(ct, skb, info->timeout * HZ);
+ nf_ct_refresh(ct, info->timeout * HZ);
/* Set expect timeout */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -1459,18 +1411,18 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, UnregistrationRequest *urq)
{
struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
int ret;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: URQ\n");
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- urq->callSignalAddress.item,
- urq->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ urq->callSignalAddress.item,
+ urq->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1481,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
info->sig_port[!dir] = 0;
/* Give it 30 seconds for UCF or URJ */
- nf_ct_refresh(ct, skb, 30 * HZ);
+ nf_ct_refresh(ct, 30 * HZ);
return 0;
}
@@ -1492,39 +1444,42 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
unsigned char **data, AdmissionRequest *arq)
{
const struct nf_ct_h323_master *info = nfct_help_data(ct);
+ const struct nfct_h323_nat_hooks *nathook;
int dir = CTINFO2DIR(ctinfo);
__be16 port;
union nf_inet_addr addr;
- typeof(set_h225_addr_hook) set_h225_addr;
pr_debug("nf_ct_ras: ARQ\n");
- set_h225_addr = rcu_dereference(set_h225_addr_hook);
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (!nathook)
+ return 0;
+
if ((arq->options & eAdmissionRequest_destCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->destCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
port == info->sig_port[dir] &&
nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- set_h225_addr && ct->status & IPS_NAT_MASK) {
+ ct->status & IPS_NAT_MASK) {
/* Answering ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->destCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- info->sig_port[!dir]);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->destCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ info->sig_port[!dir]);
}
if ((arq->options & eAdmissionRequest_srcCallSignalAddress) &&
get_h225_addr(ct, *data, &arq->srcCallSignalAddress,
&addr, &port) &&
!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) &&
- set_h225_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
/* Calling ARQ */
- return set_h225_addr(skb, protoff, data, 0,
- &arq->srcCallSignalAddress,
- &ct->tuplehash[!dir].tuple.dst.u3,
- port);
+ return nathook->set_h225_addr(skb, protoff, data, 0,
+ &arq->srcCallSignalAddress,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ port);
}
return 0;
@@ -1540,7 +1495,6 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: ACF\n");
@@ -1549,12 +1503,15 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
return 0;
if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) {
+ const struct nfct_h323_nat_hooks *nathook;
+
/* Answering ACF */
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_sig_addr(skb, ct, ctinfo, protoff, data,
- &acf->destCallSignalAddress, 1);
+ return nathook->set_sig_addr(skb, ct, ctinfo, protoff,
+ data,
+ &acf->destCallSignalAddress, 1);
return 0;
}
@@ -1567,7 +1524,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct,
exp->flags = NF_CT_EXPECT_PERMANENT;
exp->helper = nf_conntrack_helper_q931;
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -1583,15 +1540,15 @@ static int process_lrq(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, LocationRequest *lrq)
{
- typeof(set_ras_addr_hook) set_ras_addr;
+ const struct nfct_h323_nat_hooks *nathook;
pr_debug("nf_ct_ras: LRQ\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK)
- return set_ras_addr(skb, ct, ctinfo, protoff, data,
- &lrq->replyAddress, 1);
+ return nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &lrq->replyAddress, 1);
return 0;
}
@@ -1621,7 +1578,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct,
exp->flags = NF_CT_EXPECT_PERMANENT;
exp->helper = nf_conntrack_helper_q931;
- if (nf_ct_expect_related(exp) == 0) {
+ if (nf_ct_expect_related(exp, 0) == 0) {
pr_debug("nf_ct_ras: expect Q.931 ");
nf_ct_dump_tuple(&exp->tuple);
} else
@@ -1639,27 +1596,22 @@ static int process_irr(struct sk_buff *skb, struct nf_conn *ct,
unsigned int protoff,
unsigned char **data, InfoRequestResponse *irr)
{
+ const struct nfct_h323_nat_hooks *nathook;
int ret;
- typeof(set_ras_addr_hook) set_ras_addr;
- typeof(set_sig_addr_hook) set_sig_addr;
pr_debug("nf_ct_ras: IRR\n");
- set_ras_addr = rcu_dereference(set_ras_addr_hook);
- if (set_ras_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
+ nathook = rcu_dereference(nfct_h323_nat_hook);
+ if (nathook && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
ct->status & IPS_NAT_MASK) {
- ret = set_ras_addr(skb, ct, ctinfo, protoff, data,
- &irr->rasAddress, 1);
+ ret = nathook->set_ras_addr(skb, ct, ctinfo, protoff, data,
+ &irr->rasAddress, 1);
if (ret < 0)
return -1;
- }
- set_sig_addr = rcu_dereference(set_sig_addr_hook);
- if (set_sig_addr && nf_ct_l3num(ct) == NFPROTO_IPV4 &&
- ct->status & IPS_NAT_MASK) {
- ret = set_sig_addr(skb, ct, ctinfo, protoff, data,
- irr->callSignalAddress.item,
- irr->callSignalAddress.count);
+ ret = nathook->set_sig_addr(skb, ct, ctinfo, protoff, data,
+ irr->callSignalAddress.item,
+ irr->callSignalAddress.count);
if (ret < 0)
return -1;
}
@@ -1826,7 +1778,7 @@ static int __init nf_conntrack_h323_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master));
- h323_buffer = kmalloc(65536, GFP_KERNEL);
+ h323_buffer = kmalloc(H323_MAX_SIZE + 1, GFP_KERNEL);
if (!h323_buffer)
return -ENOMEM;
ret = h323_helper_init();
@@ -1842,17 +1794,6 @@ err1:
module_init(nf_conntrack_h323_init);
module_exit(nf_conntrack_h323_fini);
-EXPORT_SYMBOL_GPL(get_h225_addr);
-EXPORT_SYMBOL_GPL(set_h245_addr_hook);
-EXPORT_SYMBOL_GPL(set_h225_addr_hook);
-EXPORT_SYMBOL_GPL(set_sig_addr_hook);
-EXPORT_SYMBOL_GPL(set_ras_addr_hook);
-EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook);
-EXPORT_SYMBOL_GPL(nat_t120_hook);
-EXPORT_SYMBOL_GPL(nat_h245_hook);
-EXPORT_SYMBOL_GPL(nat_callforwarding_hook);
-EXPORT_SYMBOL_GPL(nat_q931_hook);
-
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 connection tracking helper");
MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c
index d880f3523c1d..fb1cb67a5a71 100644
--- a/net/netfilter/nf_conntrack_h323_types.c
+++ b/net/netfilter/nf_conntrack_h323_types.c
@@ -1,8 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
- *
- * This source code is licensed under General Public License version 2.
*/
static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 274baf1dab87..ceb48c3ca0a4 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Helper handling for netfilter. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -24,11 +21,14 @@
#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_log.h>
+#include <net/ip.h>
static DEFINE_MUTEX(nf_ct_helper_mutex);
struct hlist_head *nf_ct_helper_hash __read_mostly;
@@ -37,10 +37,8 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
-static bool nf_ct_auto_assign_helper __read_mostly = false;
-module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
-MODULE_PARM_DESC(nf_conntrack_helper,
- "Enable automatic conntrack helper assignment (default 0)");
+static DEFINE_MUTEX(nf_ct_nat_helpers_mutex);
+static struct list_head nf_ct_nat_helpers __read_mostly;
/* Stupid hash, but collision free for the default registrations of the
* helpers currently in the kernel. */
@@ -50,24 +48,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple)
(__force __u16)tuple->src.u.all) % nf_ct_helper_hsize;
}
-static struct nf_conntrack_helper *
-__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
- unsigned int h;
-
- if (!nf_ct_helper_count)
- return NULL;
-
- h = helper_hash(tuple);
- hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) {
- if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask))
- return helper;
- }
- return NULL;
-}
-
struct nf_conntrack_helper *
__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum)
{
@@ -130,6 +110,70 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
+static struct nf_conntrack_nat_helper *
+nf_conntrack_nat_helper_find(const char *mod_name)
+{
+ struct nf_conntrack_nat_helper *cur;
+ bool found = false;
+
+ list_for_each_entry_rcu(cur, &nf_ct_nat_helpers, list) {
+ if (!strcmp(cur->mod_name, mod_name)) {
+ found = true;
+ break;
+ }
+ }
+ return found ? cur : NULL;
+}
+
+int
+nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
+{
+ struct nf_conntrack_helper *h;
+ struct nf_conntrack_nat_helper *nat;
+ char mod_name[NF_CT_HELPER_NAME_LEN];
+ int ret = 0;
+
+ rcu_read_lock();
+ h = __nf_conntrack_helper_find(name, l3num, protonum);
+ if (!h) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ nat = nf_conntrack_nat_helper_find(h->nat_mod_name);
+ if (!nat) {
+ snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name);
+ rcu_read_unlock();
+ request_module("%s", mod_name);
+
+ rcu_read_lock();
+ nat = nf_conntrack_nat_helper_find(mod_name);
+ if (!nat) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ }
+
+ if (!try_module_get(nat->module))
+ ret = -ENOENT;
+
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_helper_try_module_get);
+
+void nf_nat_helper_put(struct nf_conntrack_helper *helper)
+{
+ struct nf_conntrack_nat_helper *nat;
+
+ nat = nf_conntrack_nat_helper_find(helper->nat_mod_name);
+ if (WARN_ON_ONCE(!nat))
+ return;
+
+ module_put(nat->module);
+}
+EXPORT_SYMBOL_GPL(nf_nat_helper_put);
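nf_nat_helper_try_module_get()/nf_nat_helper_put() let configuration paths demand-load the NAT companion of a conntrack helper by module name. On the NAT side each module registers a small descriptor; a hedged sketch of the expected pattern (macro and field names as recalled from nf_conntrack_helper.h, treat as illustrative):

	static struct nf_conntrack_nat_helper nat_helper_ftp =
		NF_CT_NAT_HELPER_INIT("ftp");   /* sets .mod_name and .module */

	static int __init nf_nat_ftp_init(void)
	{
		nf_nat_helper_register(&nat_helper_ftp);
		/* ... hook setup ... */
		return 0;
	}

	static void __exit nf_nat_ftp_fini(void)
	{
		nf_nat_helper_unregister(&nat_helper_ftp);
	}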
+
struct nf_conn_help *
nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
{
@@ -144,59 +188,31 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add);
-static struct nf_conntrack_helper *
-nf_ct_lookup_helper(struct nf_conn *ct, struct net *net)
-{
- if (!net->ct.sysctl_auto_assign_helper) {
- if (net->ct.auto_assign_helper_warned)
- return NULL;
- if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple))
- return NULL;
- pr_info("nf_conntrack: default automatic helper assignment "
- "has been turned off for security reasons and CT-based "
- " firewall rule not found. Use the iptables CT target "
- "to attach helpers instead.\n");
- net->ct.auto_assign_helper_warned = 1;
- return NULL;
- }
-
- return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-}
-
-
int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
gfp_t flags)
{
struct nf_conntrack_helper *helper = NULL;
struct nf_conn_help *help;
- struct net *net = nf_ct_net(ct);
- /* We already got a helper explicitly attached. The function
- * nf_conntrack_alter_reply - in case NAT is in use - asks for looking
- * the helper up again. Since now the user is in full control of
- * making consistent helper configurations, skip this automatic
- * re-lookup, otherwise we'll lose the helper.
- */
+ /* We already got a helper explicitly attached (e.g. nft_ct) */
if (test_bit(IPS_HELPER_BIT, &ct->status))
return 0;
- if (tmpl != NULL) {
- help = nfct_help(tmpl);
- if (help != NULL) {
- helper = help->helper;
- set_bit(IPS_HELPER_BIT, &ct->status);
- }
+ if (WARN_ON_ONCE(!tmpl))
+ return 0;
+
+ help = nfct_help(tmpl);
+ if (help != NULL) {
+ helper = rcu_dereference(help->helper);
+ set_bit(IPS_HELPER_BIT, &ct->status);
}
help = nfct_help(ct);
if (helper == NULL) {
- helper = nf_ct_lookup_helper(ct, net);
- if (helper == NULL) {
- if (help)
- RCU_INIT_POINTER(help->helper, NULL);
- return 0;
- }
+ if (help)
+ RCU_INIT_POINTER(help->helper, NULL);
+ return 0;
}
if (help == NULL) {
@@ -339,6 +355,9 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
+ if (!nf_ct_helper_hash)
+ return -ENOENT;
+
if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
return -EINVAL;
@@ -349,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
(cur->tuple.src.l3num == NFPROTO_UNSPEC ||
cur->tuple.src.l3num == me->tuple.src.l3num) &&
cur->tuple.dst.protonum == me->tuple.dst.protonum) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -360,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple,
&mask)) {
- ret = -EEXIST;
+ ret = -EBUSY;
goto out;
}
}
@@ -402,11 +421,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
-
- /* Someone may still have grabbed the helper during the unhelp above,
- * so wait for them to finish.
- */
- synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
@@ -430,6 +444,8 @@ void nf_ct_helper_init(struct nf_conntrack_helper *helper,
helper->help = help;
helper->from_nlattr = from_nlattr;
helper->me = module;
+ snprintf(helper->nat_mod_name, sizeof(helper->nat_mod_name),
+ NF_NAT_HELPER_PREFIX "%s", name);
if (spec_port == default_port)
snprintf(helper->name, sizeof(helper->name), "%s", name);
@@ -466,41 +482,36 @@ void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper,
}
EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister);
-static const struct nf_ct_ext_type helper_extend = {
- .len = sizeof(struct nf_conn_help),
- .align = __alignof__(struct nf_conn_help),
- .id = NF_CT_EXT_HELPER,
-};
+void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat)
+{
+ mutex_lock(&nf_ct_nat_helpers_mutex);
+ list_add_rcu(&nat->list, &nf_ct_nat_helpers);
+ mutex_unlock(&nf_ct_nat_helpers_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_nat_helper_register);
-void nf_conntrack_helper_pernet_init(struct net *net)
+void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat)
{
- net->ct.auto_assign_helper_warned = false;
- net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper;
+ mutex_lock(&nf_ct_nat_helpers_mutex);
+ list_del_rcu(&nat->list);
+ mutex_unlock(&nf_ct_nat_helpers_mutex);
}
+EXPORT_SYMBOL_GPL(nf_nat_helper_unregister);
int nf_conntrack_helper_init(void)
{
- int ret;
nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
nf_ct_helper_hash =
nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
if (!nf_ct_helper_hash)
return -ENOMEM;
- ret = nf_ct_extend_register(&helper_extend);
- if (ret < 0) {
- pr_err("nf_ct_helper: Unable to register helper extension.\n");
- goto out_extend;
- }
-
+ INIT_LIST_HEAD(&nf_ct_nat_helpers);
return 0;
-out_extend:
- kvfree(nf_ct_helper_hash);
- return ret;
}
void nf_conntrack_helper_fini(void)
{
- nf_ct_extend_unregister(&helper_extend);
kvfree(nf_ct_helper_hash);
+ nf_ct_helper_hash = NULL;
}
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 4099f4d79bae..5703846bea3b 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* IRC extension for IP connection tracking, Version 1.21
* (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
* based on RR's ip_conntrack_ftp.c
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -42,11 +38,14 @@ unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb,
struct nf_conntrack_expect *exp) __read_mostly;
EXPORT_SYMBOL_GPL(nf_nat_irc_hook);
+#define HELPER_NAME "irc"
+#define MAX_SEARCH_SIZE 4095
+
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_irc");
-MODULE_ALIAS_NFCT_HELPER("irc");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
module_param_array(ports, ushort, &ports_c, 0400);
MODULE_PARM_DESC(ports, "port numbers of IRC servers");
@@ -123,6 +122,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
int i, ret = NF_ACCEPT;
char *addr_beg_p, *addr_end_p;
typeof(nf_nat_irc_hook) nf_nat_irc;
+ unsigned int datalen;
/* If packet is coming from IRC server */
if (dir == IP_CT_DIR_REPLY)
@@ -142,23 +142,52 @@ static int help(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
+ datalen = skb->len - dataoff;
+ if (datalen > MAX_SEARCH_SIZE)
+ datalen = MAX_SEARCH_SIZE;
+
spin_lock_bh(&irc_buffer_lock);
- ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff,
+ ib_ptr = skb_header_pointer(skb, dataoff, datalen,
irc_buffer);
- BUG_ON(ib_ptr == NULL);
+ if (!ib_ptr) {
+ spin_unlock_bh(&irc_buffer_lock);
+ return NF_ACCEPT;
+ }
data = ib_ptr;
- data_limit = ib_ptr + skb->len - dataoff;
+ data_limit = ib_ptr + datalen;
- /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
- * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
- while (data < data_limit - (19 + MINMATCHLEN)) {
- if (memcmp(data, "\1DCC ", 5)) {
+ /* Skip any whitespace */
+ while (data < data_limit - 10) {
+ if (*data == ' ' || *data == '\r' || *data == '\n')
+ data++;
+ else
+ break;
+ }
+
+ /* strlen("PRIVMSG x ")=10 */
+ if (data < data_limit - 10) {
+ if (strncasecmp("PRIVMSG ", data, 8))
+ goto out;
+ data += 8;
+ }
+
+ /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26
+ * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26
+ */
+ while (data < data_limit - (21 + MINMATCHLEN)) {
+ /* Find first " :", the start of message */
+ if (memcmp(data, " :", 2)) {
data++;
continue;
}
+ data += 2;
+
+ /* then check that place only for the DCC command */
+ if (memcmp(data, "\1DCC ", 5))
+ goto out;
data += 5;
- /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+ /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */
iph = ip_hdr(skb);
pr_debug("DCC found in master %pI4:%u %pI4:%u\n",
@@ -174,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
pr_debug("DCC %s detected\n", dccprotos[i]);
/* we have at least
- * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+ * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid
* data left (== 14/13 bytes) */
if (parse_dcc(data, data_limit, &dcc_ip,
&dcc_port, &addr_beg_p, &addr_end_p)) {
@@ -187,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff,
/* dcc_ip can be the internal OR external (NAT'ed) IP */
tuple = &ct->tuplehash[dir].tuple;
- if (tuple->src.u3.ip != dcc_ip &&
- tuple->dst.u3.ip != dcc_ip) {
+ if ((tuple->src.u3.ip != dcc_ip &&
+ ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) ||
+ dcc_port == 0) {
net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n",
&tuple->src.u3.ip,
&dcc_ip, dcc_port);
@@ -215,7 +245,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
addr_beg_p - ib_ptr,
addr_end_p - addr_beg_p,
exp);
- else if (nf_ct_expect_related(exp) != 0) {
+ else if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct,
"cannot add expectation");
ret = NF_DROP;
@@ -250,7 +280,7 @@ static int __init nf_conntrack_irc_init(void)
irc_exp_policy.max_expected = max_dcc_channels;
irc_exp_policy.timeout = dcc_timeout;
- irc_buffer = kmalloc(65536, GFP_KERNEL);
+ irc_buffer = kmalloc(MAX_SEARCH_SIZE + 1, GFP_KERNEL);
if (!irc_buffer)
return -ENOMEM;
@@ -259,7 +289,7 @@ static int __init nf_conntrack_irc_init(void)
ports[ports_c++] = IRC_PORT;
for (i = 0; i < ports_c; i++) {
- nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc",
+ nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, HELPER_NAME,
IRC_PORT, ports[i], i, &irc_exp_policy,
0, help, NULL, THIS_MODULE);
}
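Factoring the name into HELPER_NAME keeps the module alias and the registered helper name from drifting apart; the alias is what lets request_module() auto-load the helper by name. The macro behind it, from nf_conntrack_helper.h:

	#define MODULE_ALIAS_NFCT_HELPER(helper) \
		MODULE_ALIAS("nfct-helper-" helper)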
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index adf219859901..6c46aad23313 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* test/set flag bits stored in conntrack extension area.
*
* (C) 2013 Astaro GmbH & Co KG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/export.h>
@@ -14,8 +11,6 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
-static spinlock_t nf_connlabels_lock;
-
static int replace_u32(u32 *address, u32 mask, u32 new)
{
u32 old, tmp;
@@ -63,12 +58,15 @@ EXPORT_SYMBOL_GPL(nf_connlabels_replace);
int nf_connlabels_get(struct net *net, unsigned int bits)
{
+ int v;
+
if (BIT_WORD(bits) >= NF_CT_LABELS_MAX_SIZE / sizeof(long))
return -ERANGE;
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used++;
- spin_unlock(&nf_connlabels_lock);
+ BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
+
+ v = atomic_inc_return_relaxed(&net->ct.labels_used);
+ WARN_ON_ONCE(v <= 0);
return 0;
}
@@ -76,27 +74,8 @@ EXPORT_SYMBOL_GPL(nf_connlabels_get);
void nf_connlabels_put(struct net *net)
{
- spin_lock(&nf_connlabels_lock);
- net->ct.labels_used--;
- spin_unlock(&nf_connlabels_lock);
-}
-EXPORT_SYMBOL_GPL(nf_connlabels_put);
-
-static const struct nf_ct_ext_type labels_extend = {
- .len = sizeof(struct nf_conn_labels),
- .align = __alignof__(struct nf_conn_labels),
- .id = NF_CT_EXT_LABELS,
-};
+ int v = atomic_dec_return_relaxed(&net->ct.labels_used);
-int nf_conntrack_labels_init(void)
-{
- BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX);
-
- spin_lock_init(&nf_connlabels_lock);
- return nf_ct_extend_register(&labels_extend);
-}
-
-void nf_conntrack_labels_fini(void)
-{
- nf_ct_extend_unregister(&labels_extend);
+ WARN_ON_ONCE(v < 0);
}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
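With the label area now part of the fixed extension layout, get/put only count users, so a relaxed atomic replaces the spinlock. The caller contract, sketched with a hypothetical user:

	/* Hypothetical user: reserve label bits before use, release on
	 * teardown; get and put calls must balance. */
	static int example_init(struct net *net)
	{
		return nf_connlabels_get(net, 127);   /* need bits 0..127 */
	}

	static void example_exit(struct net *net)
	{
		nf_connlabels_put(net);
	}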
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index bac5848f1c8e..55415f011943 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NetBIOS name service broadcast connection tracking helper
*
* (c) 2005 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
* This helper tracks locally originating NetBIOS name service
@@ -24,13 +20,14 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#define HELPER_NAME "netbios-ns"
#define NMBD_PORT 137
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_netbios_ns");
-MODULE_ALIAS_NFCT_HELPER("netbios_ns");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
static unsigned int timeout __read_mostly = 3;
module_param(timeout, uint, 0400);
@@ -48,7 +45,7 @@ static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
}
static struct nf_conntrack_helper helper __read_mostly = {
- .name = "netbios-ns",
+ .name = HELPER_NAME,
.tuple.src.l3num = NFPROTO_IPV4,
.tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
.tuple.dst.protonum = IPPROTO_UDP,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1213beb5a714..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -29,6 +29,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/siphash.h>
#include <linux/netfilter.h>
#include <net/netlink.h>
@@ -45,15 +46,24 @@
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
-#ifdef CONFIG_NF_NAT_NEEDED
-#include <net/netfilter/nf_nat_core.h>
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#endif
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
+#include "nf_internals.h"
+
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("List and change connection tracking table");
+
+struct ctnetlink_list_dump_ctx {
+ unsigned long last_id;
+ unsigned int cpu;
+ bool done;
+};
static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
@@ -62,7 +72,7 @@ static int ctnetlink_dump_tuples_proto(struct sk_buff *skb,
int ret = 0;
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO);
if (!nest_parms)
goto nla_put_failure;
if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum))
@@ -103,7 +113,7 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
int ret = 0;
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_IP);
if (!nest_parms)
goto nla_put_failure;
@@ -134,7 +144,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb,
ret = ctnetlink_dump_tuples_ip(skb, tuple);
if (ret >= 0) {
- l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
+ l4proto = nf_ct_l4proto_find(tuple->dst.protonum);
ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto);
}
rcu_read_unlock();
@@ -164,9 +174,18 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
+ bool skip_zero)
{
- long timeout = nf_ct_expires(ct) / HZ;
+ long timeout;
+
+ if (nf_ct_is_confirmed(ct))
+ timeout = nf_ct_expires(ct) / HZ;
+ else
+ timeout = ct->timeout / HZ;
+
+ if (skip_zero && timeout == 0)
+ return 0;
if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout)))
goto nla_put_failure;
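The timeout-dump change above distinguishes confirmed entries, whose expiry is an absolute deadline (nf_ct_expires()), from unconfirmed ones that still hold a relative timeout, and lets the destroy path skip a zero timeout attribute entirely. A compact sketch of that selection logic under an assumed tick rate; the struct and return convention are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define HZ 100			/* assumed tick rate */

struct conn {
	bool confirmed;
	long timeout;		/* unconfirmed: relative ticks */
	long expires_at;	/* confirmed: absolute expiry, in ticks */
};

/* Timeout in seconds, or -1 when the attribute should be omitted
 * (the skip_zero case used for destroy events). */
static long dump_timeout(const struct conn *ct, long now, bool skip_zero)
{
	long timeout;

	if (ct->confirmed)
		timeout = (ct->expires_at - now) / HZ;
	else
		timeout = ct->timeout / HZ;

	if (skip_zero && timeout == 0)
		return -1;
	return timeout;
}

int main(void)
{
	struct conn c = { .confirmed = true, .expires_at = 1500 };

	printf("%lds left\n", dump_timeout(&c, 1000, false));	/* 5s */
	return 0;
}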
@@ -176,21 +195,22 @@ nla_put_failure:
return -1;
}
-static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
+static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct,
+ bool destroy)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
int ret;
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
if (!l4proto->to_nlattr)
return 0;
- nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED);
+ nest_proto = nla_nest_start(skb, CTA_PROTOINFO);
if (!nest_proto)
goto nla_put_failure;
- ret = l4proto->to_nlattr(skb, nest_proto, ct);
+ ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy);
nla_nest_end(skb, nest_proto);
@@ -210,11 +230,12 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
if (!help)
return 0;
+ rcu_read_lock();
helper = rcu_dereference(help->helper);
if (!helper)
goto out;
- nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED);
+ nest_helper = nla_nest_start(skb, CTA_HELP);
if (!nest_helper)
goto nla_put_failure;
if (nla_put_string(skb, CTA_HELP_NAME, helper->name))
@@ -225,9 +246,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
nla_nest_end(skb, nest_helper);
out:
+ rcu_read_unlock();
return 0;
nla_put_failure:
+ rcu_read_unlock();
return -1;
}
@@ -248,7 +271,7 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct,
bytes = atomic64_read(&counter[dir].bytes);
}
- nest_count = nla_nest_start(skb, attr | NLA_F_NESTED);
+ nest_count = nla_nest_start(skb, attr);
if (!nest_count)
goto nla_put_failure;
@@ -292,7 +315,7 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
if (!tstamp)
return 0;
- nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
+ nest_count = nla_nest_start(skb, CTA_TIMESTAMP);
if (!nest_count)
goto nla_put_failure;
@@ -311,9 +334,15 @@ nla_put_failure:
}
#ifdef CONFIG_NF_CONNTRACK_MARK
-static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
+static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct,
+ bool dump)
{
- if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark)))
+ u32 mark = READ_ONCE(ct->mark);
+
+ if (!mark && !dump)
+ return 0;
+
+ if (nla_put_be32(skb, CTA_MARK, htonl(mark)))
goto nla_put_failure;
return 0;
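ct->mark is now read through READ_ONCE() because it can be rewritten (for instance by a later ctnetlink change request, which this patch converts to WRITE_ONCE()) while dumpers read it without the conntrack lock. A user-space analogue of the read/update pattern, with C11 relaxed atomics standing in for the kernel's READ_ONCE()/WRITE_ONCE() annotations; names are invented:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for ct->mark: one thread updates it while dump code reads
 * it locklessly; the annotations stop the compiler from tearing or
 * refetching what is otherwise a plain 32-bit access. */
static _Atomic uint32_t ct_mark;

static void change_mark(uint32_t value, uint32_t attr_mask)
{
	uint32_t old = atomic_load_explicit(&ct_mark, memory_order_relaxed);
	/* The patch's convention: keep the bits the mask excludes,
	 * xor in the new value, store only when something changed. */
	uint32_t newmark = (old & ~attr_mask) ^ value;

	if (newmark != old)
		atomic_store_explicit(&ct_mark, newmark,
				      memory_order_relaxed);
}

static void dump_mark(bool always)
{
	uint32_t mark = atomic_load_explicit(&ct_mark, memory_order_relaxed);

	/* The new "dump" flag: a zero mark is skipped unless the
	 * caller asked for an unconditional dump. */
	if (!mark && !always)
		return;
	printf("mark=0x%08x\n", (unsigned)mark);
}

int main(void)
{
	change_mark(0x2a, 0xff);
	dump_mark(false);
	return 0;
}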
@@ -321,39 +350,56 @@ nla_put_failure:
return -1;
}
#else
-#define ctnetlink_dump_mark(a, b) (0)
+#define ctnetlink_dump_mark(a, b, c) (0)
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
- int len, ret;
- char *secctx;
+ struct lsm_context ctx;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return 0;
ret = -1;
- nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+ nest_secctx = nla_nest_start(skb, CTA_SECCTX);
if (!nest_secctx)
goto nla_put_failure;
- if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context))
goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
nla_put_failure:
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
return ret;
}
#else
#define ctnetlink_dump_secctx(a, b) (0)
#endif
-#ifdef CONFIG_NF_CONNTRACK_LABELS
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct);
+
+ if (e) {
+ u64 ts = local64_read(&e->timestamp);
+
+ if (ts)
+ return nla_put_be64(skb, CTA_TIMESTAMP_EVENT,
+ cpu_to_be64(ts), CTA_TIMESTAMP_PAD);
+ }
+#endif
+ return 0;
+}
+
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -362,6 +408,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
return 0;
return nla_total_size(sizeof(labels->bits));
}
+#endif
static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
@@ -382,10 +429,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
return 0;
}
-#else
-#define ctnetlink_dump_labels(a, b) (0)
-#define ctnetlink_label_size(a) (0)
-#endif
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -396,7 +439,7 @@ static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct)
if (!(ct->status & IPS_EXPECTED))
return 0;
- nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0)
@@ -414,7 +457,7 @@ dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
{
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, type);
if (!nest_parms)
goto nla_put_failure;
@@ -466,7 +509,7 @@ static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct)
if (!synproxy)
return 0;
- nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_SYNPROXY);
if (!nest_parms)
goto nla_put_failure;
@@ -485,7 +528,9 @@ nla_put_failure:
static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
+ __be32 id = (__force __be32)nf_ct_get_id(ct);
+
+ if (nla_put_be32(skb, CTA_ID, id))
goto nla_put_failure;
return 0;
@@ -495,7 +540,7 @@ nla_put_failure:
static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
- if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))))
+ if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
goto nla_put_failure;
return 0;
@@ -503,29 +548,62 @@ nla_put_failure:
return -1;
}
+/* all these functions access ct->ext. Caller must either hold a reference
+ * on ct or prevent its deletion by holding either the bucket spinlock or
+ * pcpu dying list lock.
+ */
+static int ctnetlink_dump_extinfo(struct sk_buff *skb,
+ struct nf_conn *ct, u32 type)
+{
+ if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_helpinfo(skb, ct) < 0 ||
+ ctnetlink_dump_labels(skb, ct) < 0 ||
+ ctnetlink_dump_ct_seq_adj(skb, ct) < 0 ||
+ ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct)
+{
+ if (ctnetlink_dump_status(skb, ct) < 0 ||
+ ctnetlink_dump_mark(skb, ct, true) < 0 ||
+ ctnetlink_dump_secctx(skb, ct) < 0 ||
+ ctnetlink_dump_id(skb, ct) < 0 ||
+ ctnetlink_dump_use(skb, ct) < 0 ||
+ ctnetlink_dump_master(skb, ct) < 0)
+ return -1;
+
+ if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) &&
+ (ctnetlink_dump_timeout(skb, ct, false) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0))
+ return -1;
+
+ return 0;
+}
+
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool extinfo, unsigned int flags)
{
const struct nf_conntrack_zone *zone;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
- unsigned int flags = portid ? NLM_F_MULTI : 0, event;
+ unsigned int event;
+ if (portid)
+ flags |= NLM_F_MULTI;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
@@ -535,7 +613,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
@@ -549,20 +627,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
NF_CT_DEFAULT_ZONE_DIR) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_status(skb, ct) < 0 ||
- ctnetlink_dump_timeout(skb, ct) < 0 ||
- ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0 ||
- ctnetlink_dump_protoinfo(skb, ct) < 0 ||
- ctnetlink_dump_helpinfo(skb, ct) < 0 ||
- ctnetlink_dump_mark(skb, ct) < 0 ||
- ctnetlink_dump_secctx(skb, ct) < 0 ||
- ctnetlink_dump_labels(skb, ct) < 0 ||
- ctnetlink_dump_id(skb, ct) < 0 ||
- ctnetlink_dump_use(skb, ct) < 0 ||
- ctnetlink_dump_master(skb, ct) < 0 ||
- ctnetlink_dump_ct_seq_adj(skb, ct) < 0 ||
- ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+ if (ctnetlink_dump_info(skb, ct) < 0)
+ goto nla_put_failure;
+ if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0)
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -590,7 +657,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
len *= 3u; /* ORIG, REPLY, MASTER */
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
len += l4proto->nlattr_size;
if (l4proto->nlattr_tuple_size) {
len4 = l4proto->nlattr_tuple_size();
@@ -599,7 +666,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
return len + len4;
}
-#endif
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
@@ -614,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- int len, ret;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, NULL, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, NULL);
+ if (ret < 0)
return 0;
return nla_total_size(0) /* CTA_SECCTX */
- + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */
#else
return 0;
#endif
@@ -637,6 +703,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
return 0;
#endif
}
+#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
@@ -655,7 +722,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
@@ -667,16 +734,18 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */
+#endif
;
}
static int
-ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
{
const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
struct nf_conn *ct = item->ct;
struct sk_buff *skb;
@@ -706,18 +775,14 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct),
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = nf_ct_l3num(ct);
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
zone = nf_ct_zone(ct);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
@@ -727,7 +792,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
@@ -748,15 +813,19 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
goto nla_put_failure;
if (events & (1 << IPCT_DESTROY)) {
+ if (ctnetlink_dump_timeout(skb, ct, true) < 0)
+ goto nla_put_failure;
+
if (ctnetlink_dump_acct(skb, ct, type) < 0 ||
- ctnetlink_dump_timestamp(skb, ct) < 0)
+ ctnetlink_dump_timestamp(skb, ct) < 0 ||
+ ctnetlink_dump_protoinfo(skb, ct, true) < 0)
goto nla_put_failure;
} else {
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
goto nla_put_failure;
- if (events & (1 << IPCT_PROTOINFO)
- && ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (events & (1 << IPCT_PROTOINFO) &&
+ ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
@@ -786,10 +855,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((events & (1 << IPCT_MARK) || ct->mark)
- && ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
goto nla_put_failure;
#endif
+
+ if (ctnetlink_dump_event_timestamp(skb, ct))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
@@ -812,27 +884,117 @@ errout:
static int ctnetlink_done(struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
kfree(cb->data);
return 0;
}
+struct ctnetlink_filter_u32 {
+ u32 val;
+ u32 mask;
+};
+
struct ctnetlink_filter {
u8 family;
- struct {
- u_int32_t val;
- u_int32_t mask;
- } mark;
+ bool zone_filter;
+
+ u_int32_t orig_flags;
+ u_int32_t reply_flags;
+
+ struct nf_conntrack_tuple orig;
+ struct nf_conntrack_tuple reply;
+ struct nf_conntrack_zone zone;
+
+ struct ctnetlink_filter_u32 mark;
+ struct ctnetlink_filter_u32 status;
+};
+
+static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = {
+ [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 },
+ [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 },
};
+static int ctnetlink_parse_filter(const struct nlattr *attr,
+ struct ctnetlink_filter *filter)
+{
+ struct nlattr *tb[CTA_FILTER_MAX + 1];
+ int ret = 0;
+
+ ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy,
+ NULL);
+ if (ret)
+ return ret;
+
+ if (tb[CTA_FILTER_ORIG_FLAGS]) {
+ filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]);
+ if (filter->orig_flags & ~CTA_FILTER_F_ALL)
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[CTA_FILTER_REPLY_FLAGS]) {
+ filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]);
+ if (filter->reply_flags & ~CTA_FILTER_F_ALL)
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int ctnetlink_parse_zone(const struct nlattr *attr,
+ struct nf_conntrack_zone *zone);
+static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple,
+ u32 type, u_int8_t l3num,
+ struct nf_conntrack_zone *zone,
+ u_int32_t flags);
+
+static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark,
+ const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ if (cda[CTA_MARK]) {
+ mark->val = ntohl(nla_get_be32(cda[CTA_MARK]));
+
+ if (cda[CTA_MARK_MASK])
+ mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ else
+ mark->mask = 0xffffffff;
+ } else if (cda[CTA_MARK_MASK]) {
+ return -EINVAL;
+ }
+#endif
+ return 0;
+}
+
+static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status,
+ const struct nlattr * const cda[])
+{
+ if (cda[CTA_STATUS]) {
+ status->val = ntohl(nla_get_be32(cda[CTA_STATUS]));
+ if (cda[CTA_STATUS_MASK])
+ status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK]));
+ else
+ status->mask = status->val;
+
+ /* status->val == 0? always true, else always false. */
+ if (status->mask == 0)
+ return -EINVAL;
+ } else if (cda[CTA_STATUS_MASK]) {
+ return -EINVAL;
+ }
+
+ /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */
+ BUILD_BUG_ON(__IPS_MAX_BIT >= 32);
+ return 0;
+}
+
static struct ctnetlink_filter *
ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
{
struct ctnetlink_filter *filter;
+ int err;
#ifndef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
+ if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
return ERR_PTR(-EOPNOTSUPP);
#endif
@@ -842,13 +1004,69 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
filter->family = family;
-#ifdef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) {
- filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK]));
- filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
+ err = ctnetlink_filter_parse_mark(&filter->mark, cda);
+ if (err)
+ goto err_filter;
+
+ err = ctnetlink_filter_parse_status(&filter->status, cda);
+ if (err)
+ goto err_filter;
+
+ if (cda[CTA_ZONE]) {
+ err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone);
+ if (err < 0)
+ goto err_filter;
+ filter->zone_filter = true;
}
-#endif
+
+ if (!cda[CTA_FILTER])
+ return filter;
+
+ err = ctnetlink_parse_filter(cda[CTA_FILTER], filter);
+ if (err < 0)
+ goto err_filter;
+
+ if (filter->orig_flags) {
+ if (!cda[CTA_TUPLE_ORIG]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
+
+ err = ctnetlink_parse_tuple_filter(cda, &filter->orig,
+ CTA_TUPLE_ORIG,
+ filter->family,
+ &filter->zone,
+ filter->orig_flags);
+ if (err < 0)
+ goto err_filter;
+ }
+
+ if (filter->reply_flags) {
+ if (!cda[CTA_TUPLE_REPLY]) {
+ err = -EINVAL;
+ goto err_filter;
+ }
+
+ err = ctnetlink_parse_tuple_filter(cda, &filter->reply,
+ CTA_TUPLE_REPLY,
+ filter->family,
+ &filter->zone,
+ filter->reply_flags);
+ if (err < 0)
+ goto err_filter;
+ }
+
return filter;
+
+err_filter:
+ kfree(filter);
+
+ return ERR_PTR(err);
+}
+
+static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda)
+{
+ return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE];
}
static int ctnetlink_start(struct netlink_callback *cb)
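The filter rework above generalizes the old mark-only filter into val/mask pairs for both mark and status, with two documented rules: a missing status mask defaults to the value itself, and a zero mask is rejected because it would match either every entry or none. A sketch of just that matching contract; struct and error values are illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct filter_u32 {
	uint32_t val;
	uint32_t mask;
};

/* An entry matches when its masked field equals the target value. */
static bool filter_match_u32(const struct filter_u32 *f, uint32_t field)
{
	return (field & f->mask) == f->val;
}

static int status_filter_init(struct filter_u32 *f, uint32_t val,
			      bool have_mask, uint32_t mask)
{
	f->val = val;
	f->mask = have_mask ? mask : val;	/* default: mask = val */
	if (f->mask == 0)
		return -1;	/* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	struct filter_u32 f;

	if (status_filter_init(&f, 0x8, false, 0) == 0)
		printf("0x208 matches: %d\n",
		       filter_match_u32(&f, 0x208));	/* 1: bit 3 set */
	return 0;
}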
@@ -858,7 +1076,7 @@ static int ctnetlink_start(struct netlink_callback *cb)
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u8 family = nfmsg->nfgen_family;
- if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
+ if (ctnetlink_needs_filter(family, cda)) {
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
@@ -868,9 +1086,80 @@ static int ctnetlink_start(struct netlink_callback *cb)
return 0;
}
+static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple,
+ struct nf_conntrack_tuple *ct_tuple,
+ u_int32_t flags, int family)
+{
+ switch (family) {
+ case NFPROTO_IPV4:
+ if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
+ filter_tuple->src.u3.ip != ct_tuple->src.u3.ip)
+ return 0;
+
+ if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
+ filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip)
+ return 0;
+ break;
+ case NFPROTO_IPV6:
+ if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) &&
+ !ipv6_addr_cmp(&filter_tuple->src.u3.in6,
+ &ct_tuple->src.u3.in6))
+ return 0;
+
+ if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) &&
+ !ipv6_addr_cmp(&filter_tuple->dst.u3.in6,
+ &ct_tuple->dst.u3.in6))
+ return 0;
+ break;
+ }
+
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) &&
+ filter_tuple->dst.protonum != ct_tuple->dst.protonum)
+ return 0;
+
+ switch (ct_tuple->dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) &&
+ filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port)
+ return 0;
+
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) &&
+ filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port)
+ return 0;
+ break;
+ case IPPROTO_ICMP:
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) &&
+ filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
+ return 0;
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) &&
+ filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+ return 0;
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) &&
+ filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
+ return 0;
+ break;
+ case IPPROTO_ICMPV6:
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) &&
+ filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type)
+ return 0;
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) &&
+ filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code)
+ return 0;
+ if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) &&
+ filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id)
+ return 0;
+ break;
+ }
+
+ return 1;
+}
+
static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
{
struct ctnetlink_filter *filter = data;
+ struct nf_conntrack_tuple *tuple;
+ u32 status;
if (filter == NULL)
goto out;
@@ -882,10 +1171,33 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data)
if (filter->family && nf_ct_l3num(ct) != filter->family)
goto ignore_entry;
+ if (filter->zone_filter &&
+ !nf_ct_zone_equal_any(ct, &filter->zone))
+ goto ignore_entry;
+
+ if (filter->orig_flags) {
+ tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL);
+ if (!ctnetlink_filter_match_tuple(&filter->orig, tuple,
+ filter->orig_flags,
+ filter->family))
+ goto ignore_entry;
+ }
+
+ if (filter->reply_flags) {
+ tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY);
+ if (!ctnetlink_filter_match_tuple(&filter->reply, tuple,
+ filter->reply_flags,
+ filter->family))
+ goto ignore_entry;
+ }
+
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((ct->mark & filter->mark.mask) != filter->mark.val)
+ if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val)
goto ignore_entry;
#endif
+ status = (u32)READ_ONCE(ct->status);
+ if ((status & filter->status.mask) != filter->status.val)
+ goto ignore_entry;
out:
return 1;
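ctnetlink_filter_match_tuple() above compares only the fields that the CTA_FILTER flags select, so anything not selected behaves as a wildcard. The shape of that flag-gated comparison, reduced to a few invented fields and flag bits:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative flag bits; the real ones come from CTA_FILTER_FLAG(). */
#define FLAG_IP_SRC	(1u << 0)
#define FLAG_IP_DST	(1u << 1)
#define FLAG_PROTO_NUM	(1u << 2)

struct tuple {
	uint32_t src_ip;
	uint32_t dst_ip;
	uint8_t protonum;
};

static bool tuple_match(const struct tuple *filter, const struct tuple *t,
			uint32_t flags)
{
	if ((flags & FLAG_IP_SRC) && filter->src_ip != t->src_ip)
		return false;
	if ((flags & FLAG_IP_DST) && filter->dst_ip != t->dst_ip)
		return false;
	if ((flags & FLAG_PROTO_NUM) && filter->protonum != t->protonum)
		return false;
	return true;	/* unselected fields never disqualify */
}

int main(void)
{
	struct tuple want = { .src_ip = 0x0a000001 };	/* 10.0.0.1 */
	struct tuple got  = { .src_ip = 0x0a000001,
			      .dst_ip = 0x0a000002, .protonum = 6 };

	/* dst/proto differ but are not selected, so this matches. */
	printf("%d\n", tuple_match(&want, &got, FLAG_IP_SRC));
	return 0;
}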
@@ -894,18 +1206,26 @@ ignore_entry:
return 0;
}
+static unsigned long ctnetlink_get_id(const struct nf_conn *ct)
+{
+ unsigned long id = nf_ct_get_id(ct);
+
+ return id ? id : 1;
+}
+
static int
ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
+ unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0;
struct net *net = sock_net(skb->sk);
- struct nf_conn *ct, *last;
+ unsigned long last_id = cb->args[1];
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
struct nf_conn *nf_ct_evict[8];
+ struct nf_conn *ct;
int res, i;
spinlock_t *lockp;
- last = (struct nf_conn *)cb->args[1];
i = 0;
local_bh_disable();
@@ -926,12 +1246,11 @@ restart:
}
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
hnnode) {
- if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
- continue;
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
+ /* need to defer nf_ct_kill() until lock is released */
if (i < ARRAY_SIZE(nf_ct_evict) &&
- atomic_inc_not_zero(&ct->ct_general.use))
+ refcount_inc_not_zero(&ct->ct_general.use))
nf_ct_evict[i++] = ct;
continue;
}
@@ -939,24 +1258,24 @@ restart:
if (!net_eq(net, nf_ct_net(ct)))
continue;
+ if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+
if (cb->args[1]) {
- if (ct != last)
+ if (ctnetlink_get_id(ct) != last_id)
continue;
cb->args[1] = 0;
}
if (!ctnetlink_filter_match(ct, cb->data))
continue;
- rcu_read_lock();
res =
ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct);
- rcu_read_unlock();
+ ct, true, flags);
if (res < 0) {
- nf_conntrack_get(&ct->ct_general);
- cb->args[1] = (unsigned long)ct;
+ cb->args[1] = ctnetlink_get_id(ct);
spin_unlock(lockp);
goto out;
}
@@ -969,12 +1288,10 @@ restart:
}
out:
local_bh_enable();
- if (last) {
+ if (last_id) {
/* nf ct hash resize happened, now clear the leftover. */
- if ((struct nf_conn *)cb->args[1] == last)
+ if (cb->args[1] == last_id)
cb->args[1] = 0;
-
- nf_ct_put(last);
}
while (i) {
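The dump-table rework above stops pinning a conntrack reference in cb->args[1] across dump calls and stores the entry's id instead, with 0 reserved to mean "no cursor" (hence ctnetlink_get_id() mapping a genuine id of 0 to 1). A self-contained sketch of the resume-by-id cursor over a flat table; the real code walks hash buckets under their spinlocks:

#include <stdint.h>
#include <stdio.h>

struct entry {
	uint64_t id;	/* stable identifier, as nf_ct_get_id() provides */
	int data;
};

static uint64_t entry_id(const struct entry *e)
{
	return e->id ? e->id : 1;	/* 0 is the "no cursor" sentinel */
}

/* Emit entries until the budget is exhausted; on a partial dump,
 * remember the id of the first unsent entry and resume there on the
 * next call. No reference is held in between: a stale cursor simply
 * stops matching (the kernel copes by rescanning that bucket). */
static int dump(const struct entry *tbl, int n, int budget,
		uint64_t *cursor)
{
	int sent = 0;

	for (int i = 0; i < n; i++) {
		if (*cursor) {
			if (entry_id(&tbl[i]) != *cursor)
				continue;	/* skip until cursor found */
			*cursor = 0;
		}
		if (sent == budget) {
			*cursor = entry_id(&tbl[i]);	/* resume here */
			return sent;
		}
		printf("id=%llu data=%d\n",
		       (unsigned long long)entry_id(&tbl[i]), tbl[i].data);
		sent++;
	}
	return sent;
}

int main(void)
{
	struct entry tbl[] = { { 10, 1 }, { 20, 2 }, { 30, 3 } };
	uint64_t cursor = 0;

	while (dump(tbl, 3, 2, &cursor) && cursor)
		;	/* second pass resumes at id 30 */
	return 0;
}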
@@ -988,50 +1305,65 @@ out:
}
static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_IP_V4_SRC])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
+ if (!tb[CTA_IP_V4_DST])
+ return -EINVAL;
- t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+ }
return 0;
}
static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
+ struct nf_conntrack_tuple *t,
+ u_int32_t flags)
{
- if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_IP_V6_SRC])
+ return -EINVAL;
+
+ t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) {
+ if (!tb[CTA_IP_V6_DST])
+ return -EINVAL;
- t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
- t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+ t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+ }
return 0;
}
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
struct nlattr *tb[CTA_IP_MAX+1];
int ret = 0;
- ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL);
+ ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr,
+ cta_ip_nla_policy, NULL);
if (ret < 0)
return ret;
- ret = nla_validate_nested(attr, CTA_IP_MAX,
- cta_ip_nla_policy, NULL);
- if (ret)
- return ret;
-
switch (tuple->src.l3num) {
case NFPROTO_IPV4:
- ret = ipv4_nlattr_to_tuple(tb, tuple);
+ ret = ipv4_nlattr_to_tuple(tb, tuple, flags);
break;
case NFPROTO_IPV6:
- ret = ipv6_nlattr_to_tuple(tb, tuple);
+ ret = ipv6_nlattr_to_tuple(tb, tuple, flags);
break;
}
@@ -1043,29 +1375,35 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = {
};
static int ctnetlink_parse_tuple_proto(struct nlattr *attr,
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
const struct nf_conntrack_l4proto *l4proto;
struct nlattr *tb[CTA_PROTO_MAX+1];
int ret = 0;
- ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr,
+ proto_nla_policy, NULL);
if (ret < 0)
return ret;
+ if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)))
+ return 0;
+
if (!tb[CTA_PROTO_NUM])
return -EINVAL;
+
tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]);
rcu_read_lock();
- l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
+ l4proto = nf_ct_l4proto_find(tuple->dst.protonum);
if (likely(l4proto->nlattr_to_tuple)) {
- ret = nla_validate_nested(attr, CTA_PROTO_MAX,
- l4proto->nla_policy, NULL);
+ ret = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX,
+ l4proto->nla_policy,
+ NULL);
if (ret == 0)
- ret = l4proto->nlattr_to_tuple(tb, tuple);
+ ret = l4proto->nlattr_to_tuple(tb, tuple, flags);
}
rcu_read_unlock();
@@ -1116,38 +1454,59 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
[CTA_TUPLE_ZONE] = { .type = NLA_U16 },
};
+#define CTA_FILTER_F_ALL_CTA_PROTO \
+ (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \
+ CTA_FILTER_F_CTA_PROTO_DST_PORT | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \
+ CTA_FILTER_F_CTA_PROTO_ICMP_ID | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \
+ CTA_FILTER_F_CTA_PROTO_ICMPV6_ID)
+
static int
-ctnetlink_parse_tuple(const struct nlattr * const cda[],
- struct nf_conntrack_tuple *tuple, u32 type,
- u_int8_t l3num, struct nf_conntrack_zone *zone)
+ctnetlink_parse_tuple_filter(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple, u32 type,
+ u_int8_t l3num, struct nf_conntrack_zone *zone,
+ u_int32_t flags)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
memset(tuple, 0, sizeof(*tuple));
- err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type],
+ tuple_nla_policy, NULL);
if (err < 0)
return err;
- if (!tb[CTA_TUPLE_IP])
- return -EINVAL;
-
+ if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6)
+ return -EOPNOTSUPP;
tuple->src.l3num = l3num;
- err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
- if (err < 0)
- return err;
+ if (flags & CTA_FILTER_FLAG(CTA_IP_DST) ||
+ flags & CTA_FILTER_FLAG(CTA_IP_SRC)) {
+ if (!tb[CTA_TUPLE_IP])
+ return -EINVAL;
- if (!tb[CTA_TUPLE_PROTO])
- return -EINVAL;
+ err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags);
+ if (err < 0)
+ return err;
+ }
- err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple);
- if (err < 0)
- return err;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) {
+ if (!tb[CTA_TUPLE_PROTO])
+ return -EINVAL;
- if (tb[CTA_TUPLE_ZONE]) {
+ err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags);
+ if (err < 0)
+ return err;
+ } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) {
+ /* Can't manage proto flags without a protonum */
+ return -EINVAL;
+ }
+
+ if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) {
if (!zone)
return -EINVAL;
@@ -1166,6 +1525,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[],
return 0;
}
+static int
+ctnetlink_parse_tuple(const struct nlattr * const cda[],
+ struct nf_conntrack_tuple *tuple, u32 type,
+ u_int8_t l3num, struct nf_conntrack_zone *zone)
+{
+ return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone,
+ CTA_FILTER_FLAG(ALL));
+}
+
static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = {
[CTA_HELP_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN - 1 },
@@ -1177,7 +1545,8 @@ static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name,
int err;
struct nlattr *tb[CTA_HELP_MAX+1];
- err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr,
+ help_nla_policy, NULL);
if (err < 0)
return err;
@@ -1212,13 +1581,13 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
.len = NF_CT_LABELS_MAX_SIZE },
[CTA_LABELS_MASK] = { .type = NLA_BINARY,
.len = NF_CT_LABELS_MAX_SIZE },
+ [CTA_FILTER] = { .type = NLA_NESTED },
+ [CTA_STATUS_MASK] = { .type = NLA_U32 },
+ [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT },
};
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
{
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
- return 0;
-
return ctnetlink_filter_match(ct, data);
}
@@ -1227,94 +1596,92 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
+ struct nf_ct_iter_data iter = {
+ .net = net,
+ .portid = portid,
+ .report = report,
+ };
- if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) {
+ if (ctnetlink_needs_filter(family, cda)) {
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
+
+ iter.data = filter;
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
+ nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
}
-static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u8 family = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
if (err < 0)
return err;
- if (cda[CTA_TUPLE_ORIG])
+ if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG,
- u3, &zone);
- else if (cda[CTA_TUPLE_REPLY])
+ family, &zone);
+ else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER])
err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY,
- u3, &zone);
+ family, &zone);
else {
- return ctnetlink_flush_conntrack(net, cda,
+ u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? family : AF_UNSPEC;
+
+ return ctnetlink_flush_conntrack(info->net, cda,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh), u3);
+ nlmsg_report(info->nlh), u3);
}
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
- nf_ct_put(ct);
- return -EBUSY;
- }
-
if (cda[CTA_ID]) {
- u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
- if (id != (u32)(unsigned long)ct) {
+ __be32 id = nla_get_be32(cda[CTA_ID]);
+
+ if (id != (__force __be32)nf_ct_get_id(ct)) {
nf_ct_put(ct);
return -ENOENT;
}
}
- nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh));
+ nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh));
nf_ct_put(ct);
return 0;
}
-static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
- struct nf_conn *ct;
- struct sk_buff *skb2 = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
+ struct nf_conn *ct;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = ctnetlink_start,
.dump = ctnetlink_dump_table,
@@ -1322,7 +1689,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
.data = (void *)cda,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -1341,166 +1708,157 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- h = nf_conntrack_find_get(net, &zone, &tuple);
+ h = nf_conntrack_find_get(info->net, &zone, &tuple);
if (!h)
return -ENOENT;
ct = nf_ct_tuplehash_to_ctrack(h);
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_put(ct);
return -ENOMEM;
}
- rcu_read_lock();
- err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type), ct);
- rcu_read_unlock();
+ err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct,
+ true, 0);
nf_ct_put(ct);
- if (err <= 0)
- goto free;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+}
- return 0;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int ctnetlink_dump_one_entry(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct nf_conn *ct,
+ bool dying)
+{
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ u8 l3proto = nfmsg->nfgen_family;
+ int res;
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ if (l3proto && nf_ct_l3num(ct) != l3proto)
+ return 0;
+
+ if (ctx->last_id) {
+ if (ctnetlink_get_id(ct) != ctx->last_id)
+ return 0;
+
+ ctx->last_id = 0;
+ }
+
+ /* We can't dump extension info for the unconfirmed
+ * list because unconfirmed conntracks can have
+ * ct->ext reallocated (and thus freed).
+ *
+ * In the dying list case ct->ext can't be free'd
+ * until after we drop pcpu->lock.
+ */
+ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
+ ct, dying, 0);
+ if (res < 0)
+ ctx->last_id = ctnetlink_get_id(ct);
+
+ return res;
}
+#endif
-static int ctnetlink_done_list(struct netlink_callback *cb)
+static int
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
{
- if (cb->args[1])
- nf_ct_put((struct nf_conn *)cb->args[1]);
return 0;
}
static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conn *ct, *last;
+ struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ const struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_net_ecache *ecache_net;
+ unsigned long last_id = ctx->last_id;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- u_int8_t l3proto = nfmsg->nfgen_family;
- int res;
- int cpu;
- struct hlist_nulls_head *list;
- struct net *net = sock_net(skb->sk);
+#endif
- if (cb->args[2])
+ if (ctx->done)
return 0;
- last = (struct nf_conn *)cb->args[1];
+ ctx->last_id = 0;
- for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) {
- struct ct_pcpu *pcpu;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ ecache_net = nf_conn_pernet_ecache(net);
+ spin_lock_bh(&ecache_net->dying_lock);
- if (!cpu_possible(cpu))
+ hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
+ struct nf_conn *ct;
+ int res;
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (last_id && last_id != ctnetlink_get_id(ct))
continue;
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- spin_lock_bh(&pcpu->lock);
- list = dying ? &pcpu->dying : &pcpu->unconfirmed;
-restart:
- hlist_nulls_for_each_entry(h, n, list, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
- if (l3proto && nf_ct_l3num(ct) != l3proto)
- continue;
- if (cb->args[1]) {
- if (ct != last)
- continue;
- cb->args[1] = 0;
- }
- rcu_read_lock();
- res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
- ct);
- rcu_read_unlock();
- if (res < 0) {
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
- cb->args[0] = cpu;
- cb->args[1] = (unsigned long)ct;
- spin_unlock_bh(&pcpu->lock);
- goto out;
- }
- }
- if (cb->args[1]) {
- cb->args[1] = 0;
- goto restart;
+ res = ctnetlink_dump_one_entry(skb, cb, ct, true);
+ if (res < 0) {
+ spin_unlock_bh(&ecache_net->dying_lock);
+ return skb->len;
}
- spin_unlock_bh(&pcpu->lock);
+
+ last_id = 0;
}
- cb->args[2] = 1;
-out:
- if (last)
- nf_ct_put(last);
- return skb->len;
-}
+ spin_unlock_bh(&ecache_net->dying_lock);
+#endif
+ ctx->done = true;
-static int
-ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, true);
+ return skb->len;
}
-static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_dying(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
- .done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
}
-static int
-ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, false);
-}
-
-static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
- .done = ctnetlink_done_list,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return -EOPNOTSUPP;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
+ __must_hold(RCU)
{
- struct nf_nat_hook *nat_hook;
+ const struct nf_nat_hook *nat_hook;
int err;
nat_hook = rcu_dereference(nf_nat_hook);
@@ -1542,51 +1900,16 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
-static void
-__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
- unsigned long off)
-{
- unsigned int bit;
-
- /* Ignore these unchangable bits */
- on &= ~IPS_UNCHANGEABLE_MASK;
- off &= ~IPS_UNCHANGEABLE_MASK;
-
- for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
- if (on & (1 << bit))
- set_bit(bit, &ct->status);
- else if (off & (1 << bit))
- clear_bit(bit, &ct->status);
- }
-}
-
static int
ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
{
- unsigned long d;
- unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS]));
- d = ct->status ^ status;
-
- if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
- /* unchangeable */
- return -EBUSY;
-
- if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
- /* SEEN_REPLY bit can only be set */
- return -EBUSY;
-
- if (d & IPS_ASSURED && !(status & IPS_ASSURED))
- /* ASSURED bit can only be set */
- return -EBUSY;
-
- __ctnetlink_change_status(ct, status, 0);
- return 0;
+ return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS])));
}
static int
ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
{
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
int ret;
if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
@@ -1656,7 +1979,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
}
if (help) {
- if (help->helper == helper) {
+ if (rcu_access_pointer(help->helper) == helper) {
/* update private helper data if allowed. */
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
@@ -1675,16 +1998,7 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
static int ctnetlink_change_timeout(struct nf_conn *ct,
const struct nlattr * const cda[])
{
- u64 timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = nfct_time_stamp + (u32)timeout;
-
- if (test_bit(IPS_DYING_BIT, &ct->status))
- return -ETIME;
-
- return 0;
+ return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ);
}
#if defined(CONFIG_NF_CONNTRACK_MARK)
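Both this removed open-coded timeout update and the create path later in the patch now funnel through __nf_ct_change_timeout()/__nf_ct_set_timeout(). The arithmetic they share, per the code being deleted, widens to 64 bits before multiplying by HZ and clamps at INT_MAX ticks so a huge userspace value cannot overflow the 32-bit deadline. The clamp in isolation, with an assumed HZ:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define HZ 250	/* assumed tick rate */

static uint32_t timeout_ticks(uint32_t seconds)
{
	/* Widen first: seconds * HZ can exceed 32 bits. */
	uint64_t timeout = (uint64_t)seconds * HZ;

	if (timeout > INT_MAX)
		timeout = INT_MAX;	/* cap the relative deadline */
	return (uint32_t)timeout;
}

int main(void)
{
	printf("%u\n", (unsigned)timeout_ticks(60));	     /* 15000 */
	printf("%u\n", (unsigned)timeout_ticks(UINT32_MAX)); /* clamped */
	return 0;
}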
@@ -1697,15 +2011,14 @@ static void ctnetlink_change_mark(struct nf_conn *ct,
mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK]));
mark = ntohl(nla_get_be32(cda[CTA_MARK]));
- newmark = (ct->mark & mask) ^ mark;
- if (newmark != ct->mark)
- ct->mark = newmark;
+ newmark = (READ_ONCE(ct->mark) & mask) ^ mark;
+ if (newmark != READ_ONCE(ct->mark))
+ WRITE_ONCE(ct->mark, newmark);
}
#endif
static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
[CTA_PROTOINFO_TCP] = { .type = NLA_NESTED },
- [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED },
[CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED },
};
@@ -1717,16 +2030,14 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct,
struct nlattr *tb[CTA_PROTOINFO_MAX+1];
int err = 0;
- err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX, attr,
+ protoinfo_policy, NULL);
if (err < 0)
return err;
- rcu_read_lock();
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
+ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
if (l4proto->from_nlattr)
err = l4proto->from_nlattr(tb, ct);
- rcu_read_unlock();
return err;
}
@@ -1743,7 +2054,8 @@ static int change_seq_adj(struct nf_ct_seqadj *seq,
int err;
struct nlattr *cda[CTA_SEQADJ_MAX+1];
- err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy, NULL);
+ err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr,
+ seqadj_policy, NULL);
if (err < 0)
return err;
@@ -1820,8 +2132,9 @@ static int ctnetlink_change_synproxy(struct nf_conn *ct,
if (!synproxy)
return 0;
- err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY],
- synproxy_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX,
+ cda[CTA_SYNPROXY], synproxy_policy,
+ NULL);
if (err < 0)
return err;
@@ -1943,11 +2256,6 @@ ctnetlink_create_conntrack(struct net *net,
if (!cda[CTA_TIMEOUT])
goto err1;
- timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
- if (timeout > INT_MAX)
- timeout = INT_MAX;
- ct->timeout = (u32)timeout + nfct_time_stamp;
-
rcu_read_lock();
if (cda[CTA_HELP]) {
char *helpname = NULL;
@@ -1991,14 +2299,10 @@ ctnetlink_create_conntrack(struct net *net,
if (helper->from_nlattr)
helper->from_nlattr(helpinfo, ct);
- /* not in hash table yet so not strictly necessary */
+ /* disable helper auto-assignment for this entry */
+ ct->status |= IPS_HELPER;
RCU_INIT_POINTER(help->helper, helper);
}
- } else {
- /* try an implicit helper assignation */
- err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
- if (err < 0)
- goto err2;
}
err = ctnetlink_setup_nat(ct, cda);
@@ -2015,6 +2319,9 @@ ctnetlink_create_conntrack(struct net *net,
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
+ timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+ __nf_ct_set_timeout(ct, timeout);
+
if (cda[CTA_STATUS]) {
err = ctnetlink_change_status(ct, cda);
if (err < 0)
@@ -2071,12 +2378,15 @@ ctnetlink_create_conntrack(struct net *net,
err = nf_conntrack_hash_check_insert(ct);
if (err < 0)
- goto err2;
+ goto err3;
rcu_read_unlock();
return ct;
+err3:
+ if (ct->master)
+ nf_ct_put(ct->master);
err2:
rcu_read_unlock();
err1:
@@ -2084,18 +2394,15 @@ err1:
return ERR_PTR(err);
}
-static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_conntrack(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- struct nf_conn *ct;
- u_int8_t u3 = nfmsg->nfgen_family;
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
int err;
err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone);
@@ -2117,13 +2424,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
}
if (cda[CTA_TUPLE_ORIG])
- h = nf_conntrack_find_get(net, &zone, &otuple);
+ h = nf_conntrack_find_get(info->net, &zone, &otuple);
else if (cda[CTA_TUPLE_REPLY])
- h = nf_conntrack_find_get(net, &zone, &rtuple);
+ h = nf_conntrack_find_get(info->net, &zone, &rtuple);
if (h == NULL) {
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
enum ip_conntrack_events events;
if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY])
@@ -2131,8 +2438,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
if (otuple.dst.protonum != rtuple.dst.protonum)
return -EINVAL;
- ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple,
- &rtuple, u3);
+ ct = ctnetlink_create_conntrack(info->net, &zone, cda,
+ &otuple, &rtuple, u3);
if (IS_ERR(ct))
return PTR_ERR(ct);
@@ -2155,7 +2462,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_SYNPROXY) |
events,
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_put(ct);
}
@@ -2165,7 +2472,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
err = -EEXIST;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) {
err = ctnetlink_change_conntrack(ct, cda);
if (err == 0) {
nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
@@ -2177,7 +2484,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
(1 << IPCT_MARK) |
(1 << IPCT_SYNPROXY),
ct, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
}
@@ -2190,23 +2497,17 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
__u16 cpu, const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_CT_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) ||
nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) ||
- nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) ||
nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) ||
nla_put_be32(skb, CTA_STATS_INSERT_FAILED,
htonl(st->insert_failed)) ||
@@ -2214,7 +2515,11 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) ||
nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) ||
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
- htonl(st->search_restart)))
+ htonl(st->search_restart)) ||
+ nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
+ htonl(st->clash_resolve)) ||
+ nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
+ htonl(st->chaintoolong)))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -2253,17 +2558,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_ct_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -2273,21 +2576,17 @@ static int
ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
struct net *net)
{
- struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
+ unsigned int nr_conntracks;
+ struct nlmsghdr *nlh;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
+ nr_conntracks = nf_conntrack_count(net);
if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
goto nla_put_failure;
@@ -2303,10 +2602,8 @@ nlmsg_failure:
return -1;
}
-static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
struct sk_buff *skb2;
int err;
@@ -2316,23 +2613,15 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
return -ENOMEM;
err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
sock_net(skb->sk));
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
@@ -2371,7 +2660,9 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
+ nla_total_size(0) /* CTA_HELP */
+ nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
+ ctnetlink_secctx_size(ct)
-#ifdef CONFIG_NF_NAT_NEEDED
+ + ctnetlink_acct_size(ct)
+ + ctnetlink_timestamp_size(ct)
+#if IS_ENABLED(CONFIG_NF_NAT)
+ 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
+ 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
#endif
@@ -2385,12 +2676,6 @@ ctnetlink_glue_build_size(const struct nf_conn *ct)
;
}
-static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb,
- enum ip_conntrack_info *ctinfo)
-{
- return nf_ct_get(skb, ctinfo);
-}
-
static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
{
const struct nf_conntrack_zone *zone;
@@ -2398,7 +2683,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
zone = nf_ct_zone(ct);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0)
@@ -2408,7 +2693,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
nla_nest_end(skb, nest_parms);
- nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0)
@@ -2428,10 +2713,14 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_timeout(skb, ct) < 0)
+ if (ctnetlink_dump_timeout(skb, ct, false) < 0)
+ goto nla_put_failure;
+
+ if (ctnetlink_dump_protoinfo(skb, ct, false) < 0)
goto nla_put_failure;
- if (ctnetlink_dump_protoinfo(skb, ct) < 0)
+ if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 ||
+ ctnetlink_dump_timestamp(skb, ct) < 0)
goto nla_put_failure;
if (ctnetlink_dump_helpinfo(skb, ct) < 0)
@@ -2452,7 +2741,7 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_MARK
- if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
+ if (ctnetlink_dump_mark(skb, ct, true) < 0)
goto nla_put_failure;
#endif
if (ctnetlink_dump_labels(skb, ct) < 0)
@@ -2470,7 +2759,7 @@ ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct,
{
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, ct_attr | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, ct_attr);
if (!nest_parms)
goto nla_put_failure;
@@ -2508,7 +2797,7 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
* unchangeable bits but do not error out. Also user programs
* are allowed to clear the bits that they are allowed to change.
*/
- __ctnetlink_change_status(ct, status, ~status);
+ __nf_ct_change_status(ct, status, ~status);
return 0;
}
@@ -2551,7 +2840,8 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
struct nlattr *cda[CTA_MAX+1];
int ret;
- ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy, NULL);
+ ret = nla_parse_nested_deprecated(cda, CTA_MAX, attr, ct_nla_policy,
+ NULL);
if (ret < 0)
return ret;
@@ -2584,8 +2874,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
struct nf_conntrack_expect *exp;
int err;
- err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy,
- NULL);
+ err = nla_parse_nested_deprecated(cda, CTA_EXPECT_MAX, attr,
+ exp_nla_policy, NULL);
if (err < 0)
return err;
@@ -2608,7 +2898,7 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
if (IS_ERR(exp))
return PTR_ERR(exp);
- err = nf_ct_expect_related_report(exp, portid, report);
+ err = nf_ct_expect_related_report(exp, portid, report, 0);
nf_ct_expect_put(exp);
return err;
}
@@ -2622,8 +2912,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
}
-static struct nfnl_ct_hook ctnetlink_glue_hook = {
- .get_ct = ctnetlink_glue_get_ct,
+static const struct nfnl_ct_hook ctnetlink_glue_hook = {
.build_size = ctnetlink_glue_build_size,
.build = ctnetlink_glue_build,
.parse = ctnetlink_glue_parse,
@@ -2642,7 +2931,7 @@ static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
{
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, type | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, type);
if (!nest_parms)
goto nla_put_failure;
if (ctnetlink_dump_tuples(skb, tuple) < 0)
@@ -2667,17 +2956,18 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
memset(&m, 0xFF, sizeof(m));
memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3));
m.src.u.all = mask->src.u.all;
+ m.src.l3num = tuple->src.l3num;
m.dst.protonum = tuple->dst.protonum;
- nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK);
if (!nest_parms)
goto nla_put_failure;
rcu_read_lock();
ret = ctnetlink_dump_tuples_ip(skb, &m);
if (ret >= 0) {
- l4proto = __nf_ct_l4proto_find(tuple->dst.protonum);
- ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
+ l4proto = nf_ct_l4proto_find(tuple->dst.protonum);
+ ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto);
}
rcu_read_unlock();
@@ -2692,7 +2982,28 @@ nla_put_failure:
return -1;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const union nf_inet_addr any_addr;
+#endif
+
+static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
+{
+ static siphash_aligned_key_t exp_id_seed;
+ unsigned long a, b, c, d;
+
+ net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
+
+ a = (unsigned long)exp;
+ b = (unsigned long)exp->helper;
+ c = (unsigned long)exp->master;
+ d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed);
+
+#ifdef CONFIG_64BIT
+ return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed);
+#else
+ return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed);
+#endif
+}
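
Previously the expectation ID reported to userspace was the kernel pointer
itself, htonl((unsigned long)exp), which leaked a heap address. The siphash
above yields an ID that is stable for the lifetime of the expectation without
exposing pointers. The ID is generated and compared as a big-endian __be32,
so userspace must echo CTA_EXPECT_ID back unmodified; a hypothetical matching
helper (illustrative name) would be:

	static bool exp_id_matches(const struct nf_conntrack_expect *exp,
				   const struct nlattr *id_attr)
	{
		/* no byte swapping: both sides are __be32 cookies */
		return nla_get_be32(id_attr) == nf_expect_get_id(exp);
	}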
static int
ctnetlink_exp_dump_expect(struct sk_buff *skb,
@@ -2701,7 +3012,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
struct nf_conn *master = exp->master;
long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ;
struct nf_conn_help *help;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nlattr *nest_parms;
struct nf_conntrack_tuple nat_tuple = {};
#endif
@@ -2719,10 +3030,10 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
CTA_EXPECT_MASTER) < 0)
goto nla_put_failure;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) ||
exp->saved_proto.all) {
- nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT);
if (!nest_parms)
goto nla_put_failure;
@@ -2741,7 +3052,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
}
#endif
if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
- nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) ||
+ nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
goto nla_put_failure;
@@ -2770,19 +3081,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
int event, const struct nf_conntrack_expect *exp)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -2797,12 +3103,11 @@ nla_put_failure:
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int
-ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
+ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item)
{
struct nf_conntrack_expect *exp = item->exp;
struct net *net = nf_ct_exp_net(exp);
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *skb;
unsigned int type, group;
int flags = 0;
@@ -2825,15 +3130,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item)
goto errout;
type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type);
- nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, item->portid, 0, type, flags,
+ exp->tuple.src.l3num, NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = exp->tuple.src.l3num;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (ctnetlink_exp_dump_expect(skb, exp) < 0)
goto nla_put_failure;
@@ -2850,23 +3151,27 @@ errout:
return 0;
}
#endif
-static int ctnetlink_exp_done(struct netlink_callback *cb)
+
+static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp)
{
- if (cb->args[1])
- nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]);
- return 0;
+ unsigned long id = (unsigned long)exp;
+
+ id += nf_ct_get_id(exp->master);
+ id += exp->class;
+
+ return id ? id : 1;
}
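
The dump functions that follow stop pinning the last-seen expectation with a
reference across netlink dump invocations; they store this synthetic cursor in
cb->args[1] and rescan on re-entry. If the entry was freed in the meantime the
walk restarts from the bucket head rather than dereferencing a stale pointer.
The pattern as a stand-alone sketch (item, item_list, item_id and fill_one are
illustrative names; the caller holds rcu_read_lock()):

	static int dump_with_cursor(struct sk_buff *skb,
				    struct netlink_callback *cb)
	{
		unsigned long last_id = cb->args[1];
		struct item *it;

		list_for_each_entry_rcu(it, &item_list, node) {
			if (cb->args[1]) {
				if (item_id(it) != last_id)
					continue;	/* still before the cursor */
				cb->args[1] = 0;	/* retry the entry that did not fit */
			}
			if (fill_one(skb, it) < 0) {
				cb->args[1] = item_id(it);	/* resume point */
				break;
			}
		}
		return skb->len;
	}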
static int
ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]],
@@ -2878,7 +3183,7 @@ restart:
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -2887,9 +3192,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -2900,32 +3203,30 @@ restart:
}
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
static int
ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct nf_conntrack_expect *exp, *last;
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
struct nf_conn *ct = cb->data;
struct nf_conn_help *help = nfct_help(ct);
u_int8_t l3proto = nfmsg->nfgen_family;
+ unsigned long last_id = cb->args[1];
+ struct nf_conntrack_expect *exp;
if (cb->args[0])
return 0;
rcu_read_lock();
- last = (struct nf_conntrack_expect *)cb->args[1];
+
restart:
hlist_for_each_entry_rcu(exp, &help->expectations, lnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (cb->args[1]) {
- if (exp != last)
+ if (ctnetlink_exp_id(exp) != last_id)
continue;
cb->args[1] = 0;
}
@@ -2933,9 +3234,7 @@ restart:
cb->nlh->nlmsg_seq,
IPCTNL_MSG_EXP_NEW,
exp) < 0) {
- if (!refcount_inc_not_zero(&exp->use))
- continue;
- cb->args[1] = (unsigned long)exp;
+ cb->args[1] = ctnetlink_exp_id(exp);
goto out;
}
}
@@ -2946,9 +3245,6 @@ restart:
cb->args[0] = 1;
out:
rcu_read_unlock();
- if (last)
- nf_ct_expect_put(last);
-
return skb->len;
}
@@ -2967,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
struct nf_conntrack_zone zone;
struct netlink_dump_control c = {
.dump = ctnetlink_exp_ct_dump_table,
- .done = ctnetlink_exp_done,
};
err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER,
@@ -2998,29 +3293,27 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
return err;
}
-static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_get_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct sk_buff *skb2;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
+ struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
if (cda[CTA_EXPECT_MASTER])
- return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda,
- extack);
+ return ctnetlink_dump_exp_ct(info->net, info->sk, skb,
+ info->nlh, cda,
+ info->extack);
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
}
@@ -3040,54 +3333,52 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
if (err < 0)
return err;
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
- if (ntohl(id) != (u32)(unsigned long)exp) {
+
+ if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp);
return -ENOENT;
}
}
- err = -ENOMEM;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
+ if (!skb2) {
nf_ct_expect_put(exp);
- goto out;
+ return -ENOMEM;
}
rcu_read_lock();
err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp);
+ info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW,
+ exp);
rcu_read_unlock();
nf_ct_expect_put(exp);
- if (err <= 0)
- goto free;
-
- err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (err < 0)
- goto out;
-
- return 0;
+ if (err <= 0) {
+ kfree_skb(skb2);
+ return -ENOMEM;
+ }
-free:
- kfree_skb(skb2);
-out:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data)
{
+ struct nf_conntrack_helper *helper;
const struct nf_conn_help *m_help;
const char *name = data;
m_help = nfct_help(exp->master);
- return strcmp(m_help->helper->name, name) == 0;
+ helper = rcu_dereference(m_help->helper);
+ if (!helper)
+ return false;
+
+ return strcmp(helper->name, name) == 0;
}
static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
@@ -3095,15 +3386,13 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data)
return true;
}
-static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_del_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3119,13 +3408,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
return err;
/* bump usage count to 2 */
- exp = nf_ct_expect_find_get(net, &zone, &tuple);
+ exp = nf_ct_expect_find_get(info->net, &zone, &tuple);
if (!exp)
return -ENOENT;
if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
- if (ntohl(id) != (u32)(unsigned long)exp) {
+
+ if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp);
return -ENOENT;
}
@@ -3133,9 +3423,9 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
/* after list removal, usage count == 1 */
spin_lock_bh(&nf_conntrack_expect_lock);
- if (del_timer(&exp->timeout)) {
+ if (timer_delete(&exp->timeout)) {
nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
nf_ct_expect_put(exp);
}
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3145,14 +3435,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
} else if (cda[CTA_EXPECT_HELP_NAME]) {
char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]);
- nf_ct_expect_iterate_net(net, expect_iter_name, name,
+ nf_ct_expect_iterate_net(info->net, expect_iter_name, name,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
} else {
/* This basically means we have to flush everything */
- nf_ct_expect_iterate_net(net, expect_iter_all, NULL,
+ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return 0;
@@ -3162,7 +3452,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
const struct nlattr * const cda[])
{
if (cda[CTA_EXPECT_TIMEOUT]) {
- if (!del_timer(&x->timeout))
+ if (!timer_delete(&x->timeout))
return -ETIME;
x->timeout.expires = jiffies +
@@ -3172,23 +3462,25 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x,
return 0;
}
+#if IS_ENABLED(CONFIG_NF_NAT)
static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = {
[CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 },
[CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED },
};
+#endif
static int
ctnetlink_parse_expect_nat(const struct nlattr *attr,
struct nf_conntrack_expect *exp,
u_int8_t u3)
{
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nlattr *tb[CTA_EXPECT_NAT_MAX+1];
struct nf_conntrack_tuple nat_tuple = {};
int err;
- err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr,
- exp_nat_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_EXPECT_NAT_MAX, attr,
+ exp_nat_nla_policy, NULL);
if (err < 0)
return err;
@@ -3339,7 +3631,7 @@ ctnetlink_create_expect(struct net *net,
goto err_rcu;
}
- err = nf_ct_expect_related_report(exp, portid, report);
+ err = nf_ct_expect_related_report(exp, portid, report, 0);
nf_ct_expect_put(exp);
err_rcu:
rcu_read_unlock();
@@ -3348,15 +3640,13 @@ err_ct:
return err;
}
-static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_new_expect(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ u_int8_t u3 = info->nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_zone zone;
int err;
@@ -3375,20 +3665,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
return err;
spin_lock_bh(&nf_conntrack_expect_lock);
- exp = __nf_ct_expect_find(net, &zone, &tuple);
+ exp = __nf_ct_expect_find(info->net, &zone, &tuple);
if (!exp) {
spin_unlock_bh(&nf_conntrack_expect_lock);
err = -ENOENT;
- if (nlh->nlmsg_flags & NLM_F_CREATE) {
- err = ctnetlink_create_expect(net, &zone, cda, u3,
+ if (info->nlh->nlmsg_flags & NLM_F_CREATE) {
+ err = ctnetlink_create_expect(info->net, &zone, cda, u3,
NETLINK_CB(skb).portid,
- nlmsg_report(nlh));
+ nlmsg_report(info->nlh));
}
return err;
}
err = -EEXIST;
- if (!(nlh->nlmsg_flags & NLM_F_EXCL))
+ if (!(info->nlh->nlmsg_flags & NLM_F_EXCL))
err = ctnetlink_change_expect(exp, cda);
spin_unlock_bh(&nf_conntrack_expect_lock);
@@ -3400,20 +3690,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu,
const struct ip_conntrack_stat *st)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0, event;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK,
IPCTNL_MSG_EXP_GET_STATS_CPU);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, htons(cpu));
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(cpu);
-
if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) ||
nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) ||
nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete)))
@@ -3454,17 +3739,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int ctnetlink_stat_exp_cpu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_stat_cpu_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
return 0;
@@ -3472,44 +3755,77 @@ static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static struct nf_ct_event_notifier ctnl_notifier = {
- .fcn = ctnetlink_conntrack_event,
-};
-
-static struct nf_exp_event_notifier ctnl_notifier_exp = {
- .fcn = ctnetlink_expect_event,
+ .ct_event = ctnetlink_conntrack_event,
+ .exp_event = ctnetlink_expect_event,
};
#endif
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
- [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack,
- .attr_count = CTA_MAX,
- .policy = ct_nla_policy },
- [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu },
- [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct },
- [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying },
- [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed },
+ [IPCTNL_MSG_CT_NEW] = {
+ .call = ctnetlink_new_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_DELETE] = {
+ .call = ctnetlink_del_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_CTRZERO] = {
+ .call = ctnetlink_get_conntrack,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_MAX,
+ .policy = ct_nla_policy
+ },
+ [IPCTNL_MSG_CT_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_ct_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_STATS] = {
+ .call = ctnetlink_stat_ct,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_DYING] = {
+ .call = ctnetlink_get_ct_dying,
+ .type = NFNL_CB_MUTEX,
+ },
+ [IPCTNL_MSG_CT_GET_UNCONFIRMED] = {
+ .call = ctnetlink_get_ct_unconfirmed,
+ .type = NFNL_CB_MUTEX,
+ },
};
static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = {
- [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect,
- .attr_count = CTA_EXPECT_MAX,
- .policy = exp_nla_policy },
- [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu },
+ [IPCTNL_MSG_EXP_GET] = {
+ .call = ctnetlink_get_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_NEW] = {
+ .call = ctnetlink_new_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_DELETE] = {
+ .call = ctnetlink_del_expect,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_EXPECT_MAX,
+ .policy = exp_nla_policy
+ },
+ [IPCTNL_MSG_EXP_GET_STATS_CPU] = {
+ .call = ctnetlink_stat_exp_cpu,
+ .type = NFNL_CB_MUTEX,
+ },
};
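
Every entry now carries an explicit .type = NFNL_CB_MUTEX, meaning the handler
runs with the nfnetlink subsystem mutex held and may sleep (GFP_KERNEL
allocations, netlink_dump_start()). A rough dispatch sketch, simplified from
nfnetlink_rcv_msg() with error paths elided:

	switch (nc->type) {
	case NFNL_CB_RCU:
		rcu_read_lock();
		err = nc->call(skb, &info, (const struct nlattr **)cda);
		rcu_read_unlock();
		break;
	case NFNL_CB_MUTEX:
		nfnl_lock(subsys_id);
		err = nc->call(skb, &info, (const struct nlattr **)cda);
		nfnl_unlock(subsys_id);
		break;
	}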
static const struct nfnetlink_subsystem ctnl_subsys = {
@@ -3533,55 +3849,29 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP);
static int __net_init ctnetlink_net_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- int ret;
-
- ret = nf_conntrack_register_notifier(net, &ctnl_notifier);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot register notifier.\n");
- goto err_out;
- }
-
- ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp);
- if (ret < 0) {
- pr_err("ctnetlink_init: cannot expect register notifier.\n");
- goto err_unreg_notifier;
- }
+ nf_conntrack_register_notifier(net, &ctnl_notifier);
#endif
return 0;
-
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
-err_unreg_notifier:
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
-err_out:
- return ret;
-#endif
}
-static void ctnetlink_net_exit(struct net *net)
+static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp);
- nf_conntrack_unregister_notifier(net, &ctnl_notifier);
+ nf_conntrack_unregister_notifier(net);
#endif
}
-static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
-{
- struct net *net;
-
- list_for_each_entry(net, net_exit_list, exit_list)
- ctnetlink_net_exit(net);
-}
-
static struct pernet_operations ctnetlink_net_ops = {
.init = ctnetlink_net_init,
- .exit_batch = ctnetlink_net_exit_batch,
+ .pre_exit = ctnetlink_net_pre_exit,
};
static int __init ctnetlink_init(void)
{
int ret;
+ NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx);
+
ret = nfnetlink_subsys_register(&ctnl_subsys);
if (ret < 0) {
pr_err("ctnetlink_init: cannot register with nfnetlink.\n");
diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c
new file mode 100644
index 000000000000..068e9489e1c2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ovs.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Conntrack support functions shared by the OVS and TC conntrack actions. */
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
+#include <net/ip.h>
+#include <linux/netfilter_ipv6.h>
+
+/* 'skb' should already be pulled to nh_ofs. */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol;
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT;
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adjusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
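
A hypothetical caller, mirroring how the OVS and TC conntrack actions use this
(run_ct_helper is an assumed name): after the conntrack lookup, invoke the
helper on a packet already pulled to the network header. Note that 'proto'
here is the L3 family (NFPROTO_IPV4/NFPROTO_IPV6), not the L4 protocol:

	static int run_ct_helper(struct sk_buff *skb, u16 family)
	{
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct;

		ct = nf_ct_get(skb, &ctinfo);
		if (!ct)
			return NF_ACCEPT;

		return nf_ct_helper(skb, ct, ctinfo, family);
	}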
+
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper);
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper);
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
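
A hypothetical use, attaching the FTP helper (including its NAT companion) to
a freshly created conntrack entry; error handling sketched, not exhaustive:

	struct nf_conntrack_helper *helper;
	int err;

	err = nf_ct_add_helper(ct, "ftp", NFPROTO_IPV4, IPPROTO_TCP,
			       true /* also grab the NAT helper */, &helper);
	if (err)
		return err;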
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = skb_ip_totlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ len = ntohs(ipv6_hdr(skb)->payload_len);
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) {
+ int err = nf_ip6_check_hbh_len(skb, &len);
+
+ if (err)
+ return err;
+ }
+ len += sizeof(struct ipv6hdr);
+ break;
+ default:
+ len = skb->len;
+ }
+
+ return pskb_trim_rcsum(skb, len);
+}
+EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
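
A worked example of why the trim matters: a bare TCP SYN over IPv4 is 40 bytes
(20-byte IP header plus 20-byte TCP header), but the minimum Ethernet payload
is 46 bytes, so the frame carries 6 bytes of trailing pad and skb->len reads
46 once the MAC header is pulled. Checksumming over skb->len would then cover
the pad and fail. Hypothetical call site:

	/* skb already pulled to the network header */
	int err = nf_ct_skb_network_trim(skb, NFPROTO_IPV4);

	if (err)
		return err;	/* skb is not freed here; the caller decides */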
+
+/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ */
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u16 zone, u8 family, u8 *proto, u16 *mru)
+{
+ int err;
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err)
+ return err;
+
+ *mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ } else if (family == NFPROTO_IPV6) {
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err) {
+ if (err != -EINPROGRESS)
+ kfree_skb(skb);
+ return err;
+ }
+
+ *proto = ipv6_hdr(skb)->nexthdr;
+ *mru = IP6CB(skb)->frag_max_size;
+#endif
+ } else {
+ kfree_skb(skb);
+ return -EPFNOSUPPORT;
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
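
A hypothetical caller (net, skb and zone_id assumed), showing the ownership
rules spelled out in the comment above: on -EINPROGRESS the skb was stolen by
the IPv6 reassembler and must not be touched again; on any other error it has
already been freed:

	u8 l4proto = 0;
	u16 mru = 0;
	int err;

	err = nf_ct_handle_fragments(net, skb, zone_id, NFPROTO_IPV6,
				     &l4proto, &mru);
	if (err)
		return err;	/* stolen (-EINPROGRESS) or freed; stop either way */

	/* skb now holds the reassembled datagram; mru = frag_max_size */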
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 11562f2a08bb..4c679638df06 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Connection tracking support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
* It is a specification defined by Microsoft and some vendors
* working with Microsoft. PPTP is built on top of a modified
* version of the Internet Generic Routing Encapsulation Protocol.
@@ -44,51 +45,37 @@ MODULE_ALIAS_NFCT_HELPER("pptp");
static DEFINE_SPINLOCK(nf_pptp_lock);
-int
-(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound);
-
-int
-(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb,
- struct nf_conn *ct, enum ip_conntrack_info ctinfo,
- unsigned int protoff, struct PptpControlHeader *ctlh,
- union pptp_ctrl_union *pptpReq) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound);
-
-void
-(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig,
- struct nf_conntrack_expect *expect_reply)
- __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre);
-
-void
-(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct,
- struct nf_conntrack_expect *exp) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
+const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook;
+EXPORT_SYMBOL_GPL(nf_nat_pptp_hook);
#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* PptpControlMessageType names */
-const char *const pptp_msg_name[] = {
- "UNKNOWN_MESSAGE",
- "START_SESSION_REQUEST",
- "START_SESSION_REPLY",
- "STOP_SESSION_REQUEST",
- "STOP_SESSION_REPLY",
- "ECHO_REQUEST",
- "ECHO_REPLY",
- "OUT_CALL_REQUEST",
- "OUT_CALL_REPLY",
- "IN_CALL_REQUEST",
- "IN_CALL_REPLY",
- "IN_CALL_CONNECT",
- "CALL_CLEAR_REQUEST",
- "CALL_DISCONNECT_NOTIFY",
- "WAN_ERROR_NOTIFY",
- "SET_LINK_INFO"
+static const char *const pptp_msg_name_array[PPTP_MSG_MAX + 1] = {
+ [0] = "UNKNOWN_MESSAGE",
+ [PPTP_START_SESSION_REQUEST] = "START_SESSION_REQUEST",
+ [PPTP_START_SESSION_REPLY] = "START_SESSION_REPLY",
+ [PPTP_STOP_SESSION_REQUEST] = "STOP_SESSION_REQUEST",
+ [PPTP_STOP_SESSION_REPLY] = "STOP_SESSION_REPLY",
+ [PPTP_ECHO_REQUEST] = "ECHO_REQUEST",
+ [PPTP_ECHO_REPLY] = "ECHO_REPLY",
+ [PPTP_OUT_CALL_REQUEST] = "OUT_CALL_REQUEST",
+ [PPTP_OUT_CALL_REPLY] = "OUT_CALL_REPLY",
+ [PPTP_IN_CALL_REQUEST] = "IN_CALL_REQUEST",
+ [PPTP_IN_CALL_REPLY] = "IN_CALL_REPLY",
+ [PPTP_IN_CALL_CONNECT] = "IN_CALL_CONNECT",
+ [PPTP_CALL_CLEAR_REQUEST] = "CALL_CLEAR_REQUEST",
+ [PPTP_CALL_DISCONNECT_NOTIFY] = "CALL_DISCONNECT_NOTIFY",
+ [PPTP_WAN_ERROR_NOTIFY] = "WAN_ERROR_NOTIFY",
+ [PPTP_SET_LINK_INFO] = "SET_LINK_INFO"
};
+
+const char *pptp_msg_name(u_int16_t msg)
+{
+ if (msg > PPTP_MSG_MAX)
+ return pptp_msg_name_array[0];
+
+ return pptp_msg_name_array[msg];
+}
EXPORT_SYMBOL(pptp_msg_name);
#endif
@@ -102,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name);
static void pptp_expectfn(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
{
+ const struct nf_nat_pptp_hook *hook;
struct net *net = nf_ct_net(ct);
- typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn;
pr_debug("increasing timeouts\n");
/* increase timeout of GRE data channel conntrack entry */
@@ -113,15 +100,15 @@ static void pptp_expectfn(struct nf_conn *ct,
/* Can you see how rusty this code is, compared with the pre-2.6.11
* one? That's what happened to my shiny newnat of 2002 ;( -HW */
- nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn);
- if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK)
- nf_nat_pptp_expectfn(ct, exp);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->master->status & IPS_NAT_MASK)
+ hook->expectfn(ct, exp);
else {
struct nf_conntrack_tuple inv_t;
struct nf_conntrack_expect *exp_other;
/* obviously this tuple inversion only works until you do NAT */
- nf_ct_invert_tuplepr(&inv_t, &exp->tuple);
+ nf_ct_invert_tuple(&inv_t, &exp->tuple);
pr_debug("trying to unexpect other dir: ");
nf_ct_dump_tuple(&inv_t);
@@ -200,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct)
static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
{
struct nf_conntrack_expect *exp_orig, *exp_reply;
+ const struct nf_nat_pptp_hook *hook;
enum ip_conntrack_dir dir;
int ret = 1;
- typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre;
exp_orig = nf_ct_expect_alloc(ct);
if (exp_orig == NULL)
@@ -230,12 +217,12 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
IPPROTO_GRE, &callid, &peer_callid);
exp_reply->expectfn = pptp_expectfn;
- nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre);
- if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK)
- nf_nat_pptp_exp_gre(exp_orig, exp_reply);
- if (nf_ct_expect_related(exp_orig) != 0)
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ hook->exp_gre(exp_orig, exp_reply);
+ if (nf_ct_expect_related(exp_orig, 0) != 0)
goto out_put_both;
- if (nf_ct_expect_related(exp_reply) != 0)
+ if (nf_ct_expect_related(exp_reply, 0) != 0)
goto out_unexpect_orig;
/* Add GRE keymap entries */
@@ -270,12 +257,12 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound;
msg = ntohs(ctlh->messageType);
- pr_debug("inbound control message %s\n", pptp_msg_name[msg]);
+ pr_debug("inbound control message %s\n", pptp_msg_name(msg));
switch (msg) {
case PPTP_START_SESSION_REPLY:
@@ -310,7 +297,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
pcid = pptpReq->ocack.peersCallID;
if (info->pns_call_id != pcid)
goto invalid;
- pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
+ pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name(msg),
ntohs(cid), ntohs(pcid));
if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) {
@@ -327,7 +314,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
cid = pptpReq->icreq.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->cstate = PPTP_CALL_IN_REQ;
info->pac_call_id = cid;
break;
@@ -346,7 +333,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
if (info->pns_call_id != pcid)
goto invalid;
- pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid));
+ pr_debug("%s, PCID=%X\n", pptp_msg_name(msg), ntohs(pcid));
info->cstate = PPTP_CALL_IN_CONF;
/* we expect a GRE connection from PAC to PNS */
@@ -356,7 +343,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
case PPTP_CALL_DISCONNECT_NOTIFY:
/* server confirms disconnect */
cid = pptpReq->disc.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->cstate = PPTP_CALL_NONE;
/* untrack this call id, unexpect GRE packets */
@@ -374,16 +361,15 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound);
- if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_inbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
pr_debug("invalid %s: type=%d cid=%u pcid=%u "
"cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ pptp_msg_name(msg),
msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
ntohs(info->pns_call_id), ntohs(info->pac_call_id));
return NF_ACCEPT;
@@ -398,12 +384,12 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
enum ip_conntrack_info ctinfo)
{
struct nf_ct_pptp_master *info = nfct_help_data(ct);
+ const struct nf_nat_pptp_hook *hook;
u_int16_t msg;
__be16 cid = 0, pcid = 0;
- typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound;
msg = ntohs(ctlh->messageType);
- pr_debug("outbound control message %s\n", pptp_msg_name[msg]);
+ pr_debug("outbound control message %s\n", pptp_msg_name(msg));
switch (msg) {
case PPTP_START_SESSION_REQUEST:
@@ -425,7 +411,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
info->cstate = PPTP_CALL_OUT_REQ;
/* track PNS call id */
cid = pptpReq->ocreq.callID;
- pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid));
+ pr_debug("%s, CID=%X\n", pptp_msg_name(msg), ntohs(cid));
info->pns_call_id = cid;
break;
@@ -439,7 +425,7 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
pcid = pptpReq->icack.peersCallID;
if (info->pac_call_id != pcid)
goto invalid;
- pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg],
+ pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name(msg),
ntohs(cid), ntohs(pcid));
if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) {
@@ -470,16 +456,15 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
goto invalid;
}
- nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound);
- if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK)
- return nf_nat_pptp_outbound(skb, ct, ctinfo,
- protoff, ctlh, pptpReq);
+ hook = rcu_dereference(nf_nat_pptp_hook);
+ if (hook && ct->status & IPS_NAT_MASK)
+ return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq);
return NF_ACCEPT;
invalid:
pr_debug("invalid %s: type=%d cid=%u pcid=%u "
"cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0],
+ pptp_msg_name(msg),
msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate,
ntohs(info->pns_call_id), ntohs(info->pac_call_id));
return NF_ACCEPT;
@@ -535,7 +520,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
nexthdr_off = protoff;
tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph);
- BUG_ON(!tcph);
+ if (!tcph)
+ return NF_ACCEPT;
+
nexthdr_off += tcph->doff * 4;
datalen = tcplen - tcph->doff * 4;
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 859f5d07a915..bc1d96686b9c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -16,6 +16,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
#include <net/netfilter/nf_log.h>
#include <linux/ip.h>
@@ -41,52 +42,20 @@
#include <net/ipv6.h>
#include <net/inet_frag.h>
-extern unsigned int nf_conntrack_net_id;
-
-static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly;
-
static DEFINE_MUTEX(nf_ct_proto_mutex);
#ifdef CONFIG_SYSCTL
-static int
-nf_ct_register_sysctl(struct net *net,
- struct ctl_table_header **header,
- const char *path,
- struct ctl_table *table)
-{
- if (*header == NULL) {
- *header = register_net_sysctl(net, path, table);
- if (*header == NULL)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void
-nf_ct_unregister_sysctl(struct ctl_table_header **header,
- struct ctl_table **table,
- unsigned int users)
-{
- if (users > 0)
- return;
-
- unregister_net_sysctl_table(*header);
- kfree(*table);
- *header = NULL;
- *table = NULL;
-}
-
-__printf(5, 6)
+__printf(4, 5)
void nf_l4proto_log_invalid(const struct sk_buff *skb,
- struct net *net,
- u16 pf, u8 protonum,
+ const struct nf_hook_state *state,
+ u8 protonum,
const char *fmt, ...)
{
+ struct net *net = state->net;
struct va_format vaf;
va_list args;
- if (net->ct.sysctl_log_invalid != protonum ||
+ if (net->ct.sysctl_log_invalid != protonum &&
net->ct.sysctl_log_invalid != IPPROTO_RAW)
return;
@@ -94,15 +63,16 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_proto_%d: %pV ", protonum, &vaf);
+ nf_log_packet(net, state->pf, 0, skb, state->in, state->out,
+ NULL, "nf_ct_proto_%d: %pV ", protonum, &vaf);
va_end(args);
}
EXPORT_SYMBOL_GPL(nf_l4proto_log_invalid);
-__printf(3, 4)
+__printf(4, 5)
void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
const struct nf_conn *ct,
+ const struct nf_hook_state *state,
const char *fmt, ...)
{
struct va_format vaf;
@@ -117,303 +87,115 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb,
vaf.fmt = fmt;
vaf.va = &args;
- nf_l4proto_log_invalid(skb, net, nf_ct_l3num(ct),
+ nf_l4proto_log_invalid(skb, state,
nf_ct_protonum(ct), "%pV", &vaf);
va_end(args);
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid);
#endif
-const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto)
-{
- if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos)))
- return &nf_conntrack_l4proto_generic;
-
- return rcu_dereference(nf_ct_protos[l4proto]);
-}
-EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
-
-const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num)
-{
- const struct nf_conntrack_l4proto *p;
-
- rcu_read_lock();
- p = __nf_ct_l4proto_find(l4num);
- if (!try_module_get(p->me))
- p = &nf_conntrack_l4proto_generic;
- rcu_read_unlock();
-
- return p;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get);
-
-void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p)
-{
- module_put(p->me);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
-
-static int kill_l4proto(struct nf_conn *i, void *data)
-{
- const struct nf_conntrack_l4proto *l4proto;
- l4proto = data;
- return nf_ct_protonum(i) == l4proto->l4proto;
-}
-
-static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
- const struct nf_conntrack_l4proto *l4proto)
-{
- if (l4proto->get_net_proto) {
- /* statically built-in protocols use static per-net */
- return l4proto->get_net_proto(net);
- } else if (l4proto->net_id) {
- /* ... and loadable protocols use dynamic per-net */
- return net_generic(net, *l4proto->net_id);
- }
- return NULL;
-}
-
-static
-int nf_ct_l4proto_register_sysctl(struct net *net,
- struct nf_proto_net *pn)
-{
- int err = 0;
-
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table != NULL) {
- err = nf_ct_register_sysctl(net,
- &pn->ctl_table_header,
- "net/netfilter",
- pn->ctl_table);
- if (err < 0) {
- if (!pn->users) {
- kfree(pn->ctl_table);
- pn->ctl_table = NULL;
- }
- }
- }
-#endif /* CONFIG_SYSCTL */
- return err;
-}
-
-static
-void nf_ct_l4proto_unregister_sysctl(struct nf_proto_net *pn)
-{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table_header != NULL)
- nf_ct_unregister_sysctl(&pn->ctl_table_header,
- &pn->ctl_table,
- pn->users);
-#endif /* CONFIG_SYSCTL */
-}
-
-/* FIXME: Allow NULL functions and sub in pointers to generic for
- them. --RR */
-int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto)
-{
- int ret = 0;
-
- if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) ||
- (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size))
- return -EINVAL;
-
- mutex_lock(&nf_ct_proto_mutex);
- if (rcu_dereference_protected(
- nf_ct_protos[l4proto->l4proto],
- lockdep_is_held(&nf_ct_proto_mutex)
- ) != &nf_conntrack_l4proto_generic) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto);
-out_unlock:
- mutex_unlock(&nf_ct_proto_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one);
-
-int nf_ct_l4proto_pernet_register_one(struct net *net,
- const struct nf_conntrack_l4proto *l4proto)
-{
- int ret = 0;
- struct nf_proto_net *pn = NULL;
-
- if (l4proto->init_net) {
- ret = l4proto->init_net(net);
- if (ret < 0)
- goto out;
- }
-
- pn = nf_ct_l4proto_net(net, l4proto);
- if (pn == NULL)
- goto out;
-
- ret = nf_ct_l4proto_register_sysctl(net, pn);
- if (ret < 0)
- goto out;
-
- pn->users++;
-out:
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one);
-
-static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
-
-{
- BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos));
-
- BUG_ON(rcu_dereference_protected(
- nf_ct_protos[l4proto->l4proto],
- lockdep_is_held(&nf_ct_proto_mutex)
- ) != l4proto);
- rcu_assign_pointer(nf_ct_protos[l4proto->l4proto],
- &nf_conntrack_l4proto_generic);
-}
-
-void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto)
-{
- mutex_lock(&nf_ct_proto_mutex);
- __nf_ct_l4proto_unregister_one(l4proto);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_net();
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one);
-
-void nf_ct_l4proto_pernet_unregister_one(struct net *net,
- const struct nf_conntrack_l4proto *l4proto)
-{
- struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto);
-
- if (pn == NULL)
- return;
-
- pn->users--;
- nf_ct_l4proto_unregister_sysctl(pn);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
-
-static void
-nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
-{
- int i;
-
- mutex_lock(&nf_ct_proto_mutex);
- for (i = 0; i < num_proto; i++)
- __nf_ct_l4proto_unregister_one(l4proto[i]);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_net();
-
- for (i = 0; i < num_proto; i++)
- nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto[i]);
-}
-
-static int
-nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
+const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
{
- int ret = -EINVAL;
- unsigned int i;
-
- for (i = 0; i < num_proto; i++) {
- ret = nf_ct_l4proto_register_one(l4proto[i]);
- if (ret < 0)
- break;
- }
- if (i != num_proto) {
- pr_err("nf_conntrack: can't register l4 %d proto.\n",
- l4proto[i]->l4proto);
- nf_ct_l4proto_unregister(l4proto, i);
+ switch (l4proto) {
+ case IPPROTO_UDP: return &nf_conntrack_l4proto_udp;
+ case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp;
+ case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp;
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ case IPPROTO_UDPLITE: return &nf_conntrack_l4proto_udplite;
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: return &nf_conntrack_l4proto_gre;
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6: return &nf_conntrack_l4proto_icmpv6;
+#endif /* CONFIG_IPV6 */
}
- return ret;
-}
-int nf_ct_l4proto_pernet_register(struct net *net,
- const struct nf_conntrack_l4proto *const l4proto[],
- unsigned int num_proto)
-{
- int ret = -EINVAL;
- unsigned int i;
-
- for (i = 0; i < num_proto; i++) {
- ret = nf_ct_l4proto_pernet_register_one(net, l4proto[i]);
- if (ret < 0)
- break;
- }
- if (i != num_proto) {
- pr_err("nf_conntrack %d: pernet registration failed\n",
- l4proto[i]->l4proto);
- nf_ct_l4proto_pernet_unregister(net, l4proto, i);
- }
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
+ return &nf_conntrack_l4proto_generic;
+}
+EXPORT_SYMBOL_GPL(nf_ct_l4proto_find);
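
With the registration array gone the lookup is a plain switch over the
built-in trackers: it cannot fail, needs no RCU dereference and takes no
module reference. Typical use:

	const struct nf_conntrack_l4proto *l4proto;

	l4proto = nf_ct_l4proto_find(IPPROTO_TCP);
	/* never NULL: unknown values yield &nf_conntrack_l4proto_generic */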
-void nf_ct_l4proto_pernet_unregister(struct net *net,
- const struct nf_conntrack_l4proto *const l4proto[],
- unsigned int num_proto)
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
{
- while (num_proto-- != 0)
- nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]);
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
-static unsigned int ipv4_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
+unsigned int nf_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
+ enum ip_conntrack_info ctinfo;
+ unsigned int protoff;
+ struct nf_conn *ct;
+ bool seqadj_needed;
+ __be16 frag_off;
+ int start;
+ u8 pnum;
- /* This is where we call the helper: as the packet goes out. */
ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ if (!ct || in_vrf_postrouting(state))
return NF_ACCEPT;
help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
+ seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb);
+ if (!help && !seqadj_needed)
+ return nf_conntrack_confirm(skb);
- return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
+ /* helper->help() does not expect ICMP packets */
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return nf_conntrack_confirm(skb);
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ protoff = skb_network_offset(skb) + ip_hdrlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ pnum = ipv6_hdr(skb)->nexthdr;
+ start = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off);
+ if (start < 0 || (frag_off & htons(~0x7)) != 0)
+ return nf_conntrack_confirm(skb);
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
+ protoff = start;
+ break;
+ default:
+ return nf_conntrack_confirm(skb);
+ }
+
+ if (help) {
+ const struct nf_conntrack_helper *helper;
+ int ret;
+
+ /* rcu_read_lock()ed by nf_hook */
+ helper = rcu_dereference(help->helper);
+ if (helper) {
+ ret = helper->help(skb,
+ protoff,
+ ct, ctinfo);
+ if (ret != NF_ACCEPT)
+ return ret;
}
}
-out:
+
+ if (seqadj_needed &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+
/* We've seen it coming out the other side: confirm it */
return nf_conntrack_confirm(skb);
}
+EXPORT_SYMBOL_GPL(nf_confirm);
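
nf_confirm() folds what used to be three separate steps (helper invocation,
sequence-number adjustment, confirmation) into one hook, so each family
registers a single callback instead of a helper hook plus a confirm hook. The
ops arrays below show this for IPv4 and IPv6; a single entry, sketched in
isolation, looks like:

	static const struct nf_hook_ops confirm_op = {
		.hook		= nf_confirm,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
	};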
static unsigned int ipv4_conntrack_in(void *priv,
struct sk_buff *skb,
@@ -461,25 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = {
.priority = NF_IP_PRI_CONNTRACK,
},
{
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
@@ -511,16 +281,11 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
/* We only do TCP and SCTP at the moment: is there a better way? */
if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ tuple.dst.protonum != IPPROTO_SCTP)
return -ENOPROTOOPT;
- }
- if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
+ if ((unsigned int)*len < sizeof(struct sockaddr_in))
return -EINVAL;
- }
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
if (h) {
@@ -534,17 +299,12 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
.tuple.dst.u3.ip;
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
nf_ct_put(ct);
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
return -EFAULT;
else
return 0;
}
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
return -ENOENT;
}
@@ -587,12 +347,8 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (!h) {
- pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
- &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ if (!h)
return -ENOENT;
- }
ct = nf_ct_tuplehash_to_ctrack(h);
@@ -616,40 +372,6 @@ static struct nf_sockopt_ops so_getorigdst6 = {
.owner = THIS_MODULE,
};
-static unsigned int ipv6_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned char pnum = ipv6_hdr(skb)->nexthdr;
- int protoff;
- __be16 frag_off;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- goto out;
- }
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
static unsigned int ipv6_conntrack_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -664,42 +386,6 @@ static unsigned int ipv6_conntrack_local(void *priv,
return nf_conntrack_in(skb, state);
}
-static unsigned int ipv6_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
- enum ip_conntrack_info ctinfo;
- __be16 frag_off;
- int protoff;
- u8 nexthdr;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
-
- return helper->help(skb, protoff, ct, ctinfo);
-}
-
static const struct nf_hook_ops ipv6_conntrack_ops[] = {
{
.hook = ipv6_conntrack_in,
@@ -714,25 +400,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = {
.priority = NF_IP6_PRI_CONNTRACK,
},
{
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_LAST,
},
{
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
+ .hook = nf_confirm,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_LAST - 1,
@@ -756,12 +430,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto)
return 0;
}
+static struct nf_ct_bridge_info *nf_ct_bridge_info;
+
static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
- bool fixup_needed = false;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ bool fixup_needed = false, retry = true;
int err = 0;
-
+retry:
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
@@ -801,6 +477,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
fixup_needed = true;
break;
#endif
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info) {
+ if (!retry) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ mutex_unlock(&nf_ct_proto_mutex);
+ request_module("nf_conntrack_bridge");
+ retry = false;
+ goto retry;
+ }
+ if (!try_module_get(nf_ct_bridge_info->me)) {
+ err = -EPROTO;
+ goto out_unlock;
+ }
+ cnet->users_bridge++;
+ if (cnet->users_bridge > 1)
+ goto out_unlock;
+
+ err = nf_register_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+ if (err)
+ cnet->users_bridge = 0;
+ else
+ fixup_needed = true;
+ break;
default:
err = -EPROTO;
break;
@@ -808,93 +510,137 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
- if (fixup_needed)
- nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
- (void *)(unsigned long)nfproto, 0, 0);
+ if (fixup_needed) {
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = (void *)(unsigned long)nfproto,
+ };
+ nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
+ }
return err;
}
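
The NFPROTO_BRIDGE branch above has to drop nf_ct_proto_mutex before calling request_module(): the bridge module's init path registers itself via nf_ct_bridge_register(), which takes the same mutex, so loading it under the lock would deadlock. The retry flag bounds this to a single reload attempt. A minimal sketch of the idiom, using hypothetical names (resource_mutex, provider, load_provider) rather than real kernel APIs:

#include <linux/mutex.h>

static DEFINE_MUTEX(resource_mutex);
static struct provider *provider;	/* published by the loaded module */

static int provider_get(void)
{
	bool retry = true;
again:
	mutex_lock(&resource_mutex);
	if (!provider) {
		mutex_unlock(&resource_mutex);
		if (!retry)
			return -EPROTO;
		retry = false;
		load_provider();	/* stand-in for request_module() */
		goto again;
	}
	/* provider is pinned while we hold the mutex; take refs here */
	mutex_unlock(&resource_mutex);
	return 0;
}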
static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
{
- struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
mutex_lock(&nf_ct_proto_mutex);
switch (nfproto) {
case NFPROTO_IPV4:
- if (cnet->users4 && (--cnet->users4 == 0))
+ if (cnet->users4 && (--cnet->users4 == 0)) {
nf_unregister_net_hooks(net, ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
+ nf_defrag_ipv4_disable(net);
+ }
break;
#if IS_ENABLED(CONFIG_IPV6)
case NFPROTO_IPV6:
- if (cnet->users6 && (--cnet->users6 == 0))
+ if (cnet->users6 && (--cnet->users6 == 0)) {
nf_unregister_net_hooks(net, ipv6_conntrack_ops,
ARRAY_SIZE(ipv6_conntrack_ops));
+ nf_defrag_ipv6_disable(net);
+ }
break;
#endif
- }
+ case NFPROTO_BRIDGE:
+ if (!nf_ct_bridge_info)
+ break;
+ if (cnet->users_bridge && (--cnet->users_bridge == 0))
+ nf_unregister_net_hooks(net, nf_ct_bridge_info->ops,
+ nf_ct_bridge_info->ops_size);
+ module_put(nf_ct_bridge_info->me);
+ break;
+ }
mutex_unlock(&nf_ct_proto_mutex);
}
-int nf_ct_netns_get(struct net *net, u8 nfproto)
+static int nf_ct_netns_inet_get(struct net *net)
{
int err;
- if (nfproto == NFPROTO_INET) {
- err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_do_get(net, nfproto);
- if (err < 0)
- goto err1;
- }
- return 0;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ return err;
err2:
nf_ct_netns_put(net, NFPROTO_IPV4);
err1:
+#endif
+ return err;
+}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ switch (nfproto) {
+ case NFPROTO_INET:
+ err = nf_ct_netns_inet_get(net);
+ break;
+ case NFPROTO_BRIDGE:
+ err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE);
+ if (err < 0)
+ return err;
+
+ err = nf_ct_netns_inet_get(net);
+ if (err < 0) {
+ nf_ct_netns_put(net, NFPROTO_BRIDGE);
+ return err;
+ }
+ break;
+ default:
+ err = nf_ct_netns_do_get(net, nfproto);
+ break;
+ }
return err;
}
EXPORT_SYMBOL_GPL(nf_ct_netns_get);
void nf_ct_netns_put(struct net *net, uint8_t nfproto)
{
- if (nfproto == NFPROTO_INET) {
+ switch (nfproto) {
+ case NFPROTO_BRIDGE:
+ nf_ct_netns_do_put(net, NFPROTO_BRIDGE);
+ fallthrough;
+ case NFPROTO_INET:
nf_ct_netns_do_put(net, NFPROTO_IPV4);
nf_ct_netns_do_put(net, NFPROTO_IPV6);
- } else {
+ break;
+ default:
nf_ct_netns_do_put(net, nfproto);
+ break;
}
}
EXPORT_SYMBOL_GPL(nf_ct_netns_put);
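
Users of conntrack are expected to bracket their lifetime with this get/put pair so the hooks stay registered only while someone needs them. A minimal sketch of the usual pairing, with a hypothetical caller context (my_ctx stands in for whatever carries the netns and family):

static int my_user_init(struct my_ctx *ctx)
{
	/* registers the conntrack (and defrag) hooks for this family */
	return nf_ct_netns_get(ctx->net, ctx->family);
}

static void my_user_destroy(struct my_ctx *ctx)
{
	/* the last put for the family unregisters the hooks again */
	nf_ct_netns_put(ctx->net, ctx->family);
}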
-static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
- &nf_conntrack_l4proto_tcp,
- &nf_conntrack_l4proto_udp,
- &nf_conntrack_l4proto_icmp,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite,
-#endif
-#if IS_ENABLED(CONFIG_IPV6)
- &nf_conntrack_l4proto_icmpv6,
-#endif /* CONFIG_IPV6 */
-};
+void nf_ct_bridge_register(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = info;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_register);
+
+void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info)
+{
+ WARN_ON(!nf_ct_bridge_info);
+ mutex_lock(&nf_ct_proto_mutex);
+ nf_ct_bridge_info = NULL;
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister);
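
These two entry points let the modular bridge conntrack code hand its hook array to the core; nf_ct_netns_do_get/do_put above then register those hooks per netns and manage the module reference. A simplified sketch of a registering module, with hypothetical hook logic (the real nf_conntrack_bridge also handles fragmented traffic; my_bridge_in just feeds packets to conntrack):

static unsigned int my_bridge_in(void *priv, struct sk_buff *skb,
				 const struct nf_hook_state *state)
{
	return nf_conntrack_in(skb, state);
}

static struct nf_hook_ops my_bridge_ops[] = {
	{
		.hook		= my_bridge_in,
		.pf		= NFPROTO_BRIDGE,
		.hooknum	= NF_BR_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK,
	},
};

static struct nf_ct_bridge_info my_bridge_info = {
	.ops		= my_bridge_ops,
	.ops_size	= ARRAY_SIZE(my_bridge_ops),
	.me		= THIS_MODULE,
};

static int __init my_bridge_ct_init(void)
{
	nf_ct_bridge_register(&my_bridge_info);
	return 0;
}

static void __exit my_bridge_ct_exit(void)
{
	nf_ct_bridge_unregister(&my_bridge_info);
}

module_init(my_bridge_ct_init);
module_exit(my_bridge_ct_exit);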
int nf_conntrack_proto_init(void)
{
- int ret = 0, i;
+ int ret;
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0)
@@ -906,21 +652,11 @@ int nf_conntrack_proto_init(void)
goto cleanup_sockopt;
#endif
- for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
- RCU_INIT_POINTER(nf_ct_protos[i],
- &nf_conntrack_l4proto_generic);
-
- ret = nf_ct_l4proto_register(builtin_l4proto,
- ARRAY_SIZE(builtin_l4proto));
- if (ret < 0)
- goto cleanup_sockopt2;
-
return ret;
-cleanup_sockopt2:
- nf_unregister_sockopt(&so_getorigdst);
+
#if IS_ENABLED(CONFIG_IPV6)
cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst6);
+ nf_unregister_sockopt(&so_getorigdst);
#endif
return ret;
}
@@ -933,43 +669,23 @@ void nf_conntrack_proto_fini(void)
#endif
}
-int nf_conntrack_proto_pernet_init(struct net *net)
+void nf_conntrack_proto_pernet_init(struct net *net)
{
- int err;
- struct nf_proto_net *pn = nf_ct_l4proto_net(net,
- &nf_conntrack_l4proto_generic);
-
- err = nf_conntrack_l4proto_generic.init_net(net);
- if (err < 0)
- return err;
- err = nf_ct_l4proto_register_sysctl(net,
- pn);
- if (err < 0)
- return err;
-
- err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
- ARRAY_SIZE(builtin_l4proto));
- if (err < 0) {
- nf_ct_l4proto_unregister_sysctl(pn);
- return err;
- }
-
- pn->users++;
- return 0;
-}
-
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
- struct nf_proto_net *pn = nf_ct_l4proto_net(net,
- &nf_conntrack_l4proto_generic);
-
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
- ARRAY_SIZE(builtin_l4proto));
- pn->users--;
- nf_ct_l4proto_unregister_sysctl(pn);
+ nf_conntrack_generic_init_net(net);
+ nf_conntrack_udp_init_net(net);
+ nf_conntrack_tcp_init_net(net);
+ nf_conntrack_icmp_init_net(net);
+#if IS_ENABLED(CONFIG_IPV6)
+ nf_conntrack_icmpv6_init_net(net);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ nf_conntrack_sctp_init_net(net);
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ nf_conntrack_gre_init_net(net);
+#endif
}
-
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
@@ -977,3 +693,4 @@ MODULE_ALIAS("ip_conntrack");
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 and IPv6 connection tracking");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
deleted file mode 100644
index 023c1445bc39..000000000000
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ /dev/null
@@ -1,867 +0,0 @@
-/*
- * DCCP connection tracking protocol helper
- *
- * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sysctl.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/dccp.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_timeout.h>
-#include <net/netfilter/nf_log.h>
-
-/* Timeouts are based on values from RFC4340:
- *
- * - REQUEST:
- *
- * 8.1.2. Client Request
- *
- * A client MAY give up on its DCCP-Requests after some time
- * (3 minutes, for example).
- *
- * - RESPOND:
- *
- * 8.1.3. Server Response
- *
- * It MAY also leave the RESPOND state for CLOSED after a timeout of
- * not less than 4MSL (8 minutes);
- *
- * - PARTOPEN:
- *
- * 8.1.5. Handshake Completion
- *
- * If the client remains in PARTOPEN for more than 4MSL (8 minutes),
- * it SHOULD reset the connection with Reset Code 2, "Aborted".
- *
- * - OPEN:
- *
- * The DCCP timestamp overflows after 11.9 hours. If the connection
- * stays idle this long the sequence number won't be recognized
- * as valid anymore.
- *
- * - CLOSEREQ/CLOSING:
- *
- * 8.3. Termination
- *
- * The retransmission timer should initially be set to go off in two
- * round-trip times and should back off to not less than once every
- * 64 seconds ...
- *
- * - TIMEWAIT:
- *
- * 4.3. States
- *
- * A server or client socket remains in this state for 2MSL (4 minutes)
- * after the connection has been torn down, ...
- */
-
-#define DCCP_MSL (2 * 60 * HZ)
-
-static const char * const dccp_state_names[] = {
- [CT_DCCP_NONE] = "NONE",
- [CT_DCCP_REQUEST] = "REQUEST",
- [CT_DCCP_RESPOND] = "RESPOND",
- [CT_DCCP_PARTOPEN] = "PARTOPEN",
- [CT_DCCP_OPEN] = "OPEN",
- [CT_DCCP_CLOSEREQ] = "CLOSEREQ",
- [CT_DCCP_CLOSING] = "CLOSING",
- [CT_DCCP_TIMEWAIT] = "TIMEWAIT",
- [CT_DCCP_IGNORE] = "IGNORE",
- [CT_DCCP_INVALID] = "INVALID",
-};
-
-#define sNO CT_DCCP_NONE
-#define sRQ CT_DCCP_REQUEST
-#define sRS CT_DCCP_RESPOND
-#define sPO CT_DCCP_PARTOPEN
-#define sOP CT_DCCP_OPEN
-#define sCR CT_DCCP_CLOSEREQ
-#define sCG CT_DCCP_CLOSING
-#define sTW CT_DCCP_TIMEWAIT
-#define sIG CT_DCCP_IGNORE
-#define sIV CT_DCCP_INVALID
-
-/*
- * DCCP state transition table
- *
- * The assumption is the same as for TCP tracking:
- *
- * We are the man in the middle. All the packets go through us but might
- * get lost in transit to the destination. It is assumed that the destination
- * can't receive segments we haven't seen.
- *
- * The following states exist:
- *
- * NONE: Initial state, expecting Request
- * REQUEST: Request seen, waiting for Response from server
- * RESPOND: Response from server seen, waiting for Ack from client
- * PARTOPEN: Ack after Response seen, waiting for packet other than Response,
- * Reset or Sync from server
- * OPEN: Packet other than Response, Reset or Sync seen
- * CLOSEREQ: CloseReq from server seen, expecting Close from client
- * CLOSING: Close seen, expecting Reset
- * TIMEWAIT: Reset seen
- * IGNORE: Not determinable whether packet is valid
- *
- * Some states exist only on one side of the connection: REQUEST, RESPOND,
- * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
- * the one it was in before.
- *
- * Packets are marked as ignored (sIG) if we don't know whether they're
- * valid (for example a reincarnation of a connection that we didn't
- * notice had died already); the server may still send back a
- * connection-closing Reset or a Response to them. sIG is also used
- * for Sync/SyncAck packets, which we don't care about.
- */
-static const u_int8_t
-dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
- [CT_DCCP_ROLE_CLIENT] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sRQ Regular Request
- * sRQ -> sRQ Retransmitted Request or reincarnation
- * sRS -> sRS Retransmitted Request (apparently Response
- * got lost after we saw it) or reincarnation
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, might be response to ignored Request
- * sRS -> sIG Ignore, might be response to ignored Request
- * sPO -> sIG Ignore, might be response to ignored Request
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, reincarnation in reverse direction
- * goes through sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN
- * sOP -> sOP Regular ACK, remain in OPEN
- * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet
- * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.)
- * sPO -> sPO Remain in PARTOPEN state
- * sOP -> sOP Regular DataAck packet in OPEN state
- * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.)
- * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * CLOSEREQ may only be sent by the server.
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sCG Client-initiated close
- * sOP -> sCG Client-initiated close
- * sCR -> sCG Close in response to CloseReq (8.3.)
- * sCG -> sCG Retransmit
- * sTW -> sIV Late retransmit, already in TIME_WAIT
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.)
- * sRS -> sTW Response received without Request
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.)
- * sOP -> sTW Connection reset
- * sCR -> sTW Connection reset
- * sCG -> sTW Connection reset
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
- [CT_DCCP_ROLE_SERVER] = {
- [DCCP_PKT_REQUEST] = {
- /*
- * sNO -> sIV Invalid
- * sRQ -> sIG Ignore, conntrack might be out of sync
- * sRS -> sIG Ignore, conntrack might be out of sync
- * sPO -> sIG Ignore, conntrack might be out of sync
- * sOP -> sIG Ignore, conntrack might be out of sync
- * sCR -> sIG Ignore, conntrack might be out of sync
- * sCG -> sIG Ignore, conntrack might be out of sync
- * sTW -> sRQ Reincarnation, must reverse roles
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
- },
- [DCCP_PKT_RESPONSE] = {
- /*
- * sNO -> sIV Response without Request
- * sRQ -> sRS Response to clients Request
- * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT)
- * sPO -> sIG Response to an ignored Request or late retransmit
- * sOP -> sIG Ignore, might be response to ignored Request
- * sCR -> sIG Ignore, might be response to ignored Request
- * sCG -> sIG Ignore, might be response to ignored Request
- * sTW -> sIV Invalid, Request from client in sTW moves to sRQ
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
- },
- [DCCP_PKT_ACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Ack in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Ack in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATA] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular Data packet in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_DATAACK] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP Enter OPEN state (8.1.5.)
- * sOP -> sOP Regular DataAck in OPEN state
- * sCR -> sIV Waiting for Close from client
- * sCG -> sCG Data in CLOSING MAY be processed (8.3.)
- * sTW -> sIV
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
- },
- [DCCP_PKT_CLOSEREQ] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.)
- * sOP -> sCR CloseReq in OPEN state
- * sCR -> sCR Retransmit
- * sCG -> sCR Simultaneous close, client sends another Close
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
- },
- [DCCP_PKT_CLOSE] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sIV No connection
- * sRS -> sIV No connection
- * sPO -> sOP -> sCG Move directly to CLOSING
- * sOP -> sCG Move to CLOSING
- * sCR -> sIV Close after CloseReq is invalid
- * sCG -> sCG Retransmit
- * sTW -> sIV Already closed
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
- },
- [DCCP_PKT_RESET] = {
- /*
- * sNO -> sIV No connection
- * sRQ -> sTW Reset in response to Request
- * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.)
- * sOP -> sTW
- * sCR -> sTW
- * sCG -> sTW
- * sTW -> sIG Ignore (don't refresh timer)
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
- },
- [DCCP_PKT_SYNC] = {
- /*
- * We currently ignore Sync packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- [DCCP_PKT_SYNCACK] = {
- /*
- * We currently ignore SyncAck packets
- *
- * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
- },
- },
-};
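
The table is indexed as [role][packet type][current state], and each data row is annotated with the transition it encodes for the eight reachable states. A worked lookup against the rows above (values verifiable by eye):

/* A client Request arriving while the connection sits in TIMEWAIT is
 * the reincarnation case (sTW -> sRQ in the first row):
 */
u8 next = dccp_state_table[CT_DCCP_ROLE_CLIENT]
			  [DCCP_PKT_REQUEST]
			  [CT_DCCP_TIMEWAIT];
/* next == CT_DCCP_REQUEST */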
-
-static noinline bool
-dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- const struct dccp_hdr *dh)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_dccp_net *dn;
- const char *msg;
- u_int8_t state;
-
- state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
- switch (state) {
- default:
- dn = nf_dccp_pernet(net);
- if (dn->dccp_loose == 0) {
- msg = "not picking up existing connection ";
- goto out_invalid;
- }
- case CT_DCCP_REQUEST:
- break;
- case CT_DCCP_INVALID:
- msg = "invalid state transition ";
- goto out_invalid;
- }
-
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.state = CT_DCCP_NONE;
- ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
- ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
- ct->proto.dccp.handshake_seq = 0;
- return true;
-
-out_invalid:
- nf_ct_l4proto_log_invalid(skb, ct, "%s", msg);
- return false;
-}
-
-static u64 dccp_ack_seq(const struct dccp_hdr *dh)
-{
- const struct dccp_hdr_ack_bits *dhack;
-
- dhack = (void *)dh + __dccp_basic_hdr_len(dh);
- return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
- ntohl(dhack->dccph_ack_nr_low);
-}
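
DCCP sequence and acknowledgment numbers are 48 bits wide, carried as a 16-bit high half and a 32-bit low half; the helper above byte-swaps each half and splices them together. A standalone, runnable restatement of the arithmetic (sample values made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t high = 0x0001;		/* dccph_ack_nr_high, host order */
	uint32_t low  = 0x00000002;	/* dccph_ack_nr_low,  host order */

	uint64_t ack = ((uint64_t)high << 32) + low;
	printf("0x%012llx\n", (unsigned long long)ack);	/* 0x000100000002 */
	return 0;
}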
-
-static bool dccp_error(const struct dccp_hdr *dh,
- struct sk_buff *skb, unsigned int dataoff,
- const struct nf_hook_state *state)
-{
- unsigned int dccp_len = skb->len - dataoff;
- unsigned int cscov;
- const char *msg;
-
- if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
- dh->dccph_doff * 4 > dccp_len) {
- msg = "nf_ct_dccp: truncated/malformed packet ";
- goto out_invalid;
- }
-
- cscov = dccp_len;
- if (dh->dccph_cscov) {
- cscov = (dh->dccph_cscov - 1) * 4;
- if (cscov > dccp_len) {
- msg = "nf_ct_dccp: bad checksum coverage ";
- goto out_invalid;
- }
- }
-
- if (state->hook == NF_INET_PRE_ROUTING &&
- state->net->ct.sysctl_checksum &&
- nf_checksum_partial(skb, state->hook, dataoff, cscov,
- IPPROTO_DCCP, state->pf)) {
- msg = "nf_ct_dccp: bad checksum ";
- goto out_invalid;
- }
-
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- msg = "nf_ct_dccp: reserved packet type ";
- goto out_invalid;
- }
- return false;
-out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_DCCP, "%s", msg);
- return true;
-}
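
A nonzero CsCov field is translated to a coverage length with (cscov - 1) * 4, and a claim that exceeds the bytes actually present is rejected before the checksum is verified. A runnable restatement of that bound check (the full RFC 4340 semantics of CsCov are simplified away here):

#include <stdio.h>

int main(void)
{
	unsigned int dccp_len = 40;	/* bytes from the DCCP header onward */
	unsigned int fields[] = { 1, 4, 12 };

	for (unsigned int i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
		unsigned int cscov = (fields[i] - 1) * 4;

		printf("CsCov=%u -> %u bytes covered: %s\n", fields[i], cscov,
		       cscov > dccp_len ? "invalid" : "ok");
	}
	return 0;	/* 1 -> 0 ok, 4 -> 12 ok, 12 -> 44 invalid */
}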
-
-static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
- unsigned int dataoff, enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
-{
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- struct dccp_hdr _dh, *dh;
- u_int8_t type, old_state, new_state;
- enum ct_dccp_roles role;
- unsigned int *timeouts;
-
- dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
- if (!dh)
- return NF_DROP;
-
- if (dccp_error(dh, skb, dataoff, state))
- return -NF_ACCEPT;
-
- type = dh->dccph_type;
- if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh))
- return -NF_ACCEPT;
-
- if (type == DCCP_PKT_RESET &&
- !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- /* Tear down connection immediately if only reply is a RESET */
- nf_ct_kill_acct(ct, ctinfo, skb);
- return NF_ACCEPT;
- }
-
- spin_lock_bh(&ct->lock);
-
- role = ct->proto.dccp.role[dir];
- old_state = ct->proto.dccp.state;
- new_state = dccp_state_table[role][type][old_state];
-
- switch (new_state) {
- case CT_DCCP_REQUEST:
- if (old_state == CT_DCCP_TIMEWAIT &&
- role == CT_DCCP_ROLE_SERVER) {
- /* Reincarnation in the reverse direction: reopen and
- * reverse client/server roles. */
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
- }
- break;
- case CT_DCCP_RESPOND:
- if (old_state == CT_DCCP_REQUEST)
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- break;
- case CT_DCCP_PARTOPEN:
- if (old_state == CT_DCCP_RESPOND &&
- type == DCCP_PKT_ACK &&
- dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
- set_bit(IPS_ASSURED_BIT, &ct->status);
- break;
- case CT_DCCP_IGNORE:
- /*
- * Connection tracking might be out of sync, so we ignore
- * packets that might establish a new connection and resync
- * if the server responds with a valid Response.
- */
- if (ct->proto.dccp.last_dir == !dir &&
- ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
- type == DCCP_PKT_RESPONSE) {
- ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
- new_state = CT_DCCP_RESPOND;
- break;
- }
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
-
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid packet");
- return NF_ACCEPT;
- case CT_DCCP_INVALID:
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "%s", "invalid state transition");
- return -NF_ACCEPT;
- }
-
- ct->proto.dccp.last_dir = dir;
- ct->proto.dccp.last_pkt = type;
- ct->proto.dccp.state = new_state;
- spin_unlock_bh(&ct->lock);
-
- if (new_state != old_state)
- nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
-
- timeouts = nf_ct_timeout_lookup(ct);
- if (!timeouts)
- timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout;
- nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
-
- return NF_ACCEPT;
-}
-
-static bool dccp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.dccp.state) {
- case CT_DCCP_CLOSEREQ:
- case CT_DCCP_CLOSING:
- case CT_DCCP_TIMEWAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
-{
- seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
-{
- struct nlattr *nest_parms;
-
- spin_lock_bh(&ct->lock);
- nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED);
- if (!nest_parms)
- goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
- nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
- cpu_to_be64(ct->proto.dccp.handshake_seq),
- CTA_PROTOINFO_DCCP_PAD))
- goto nla_put_failure;
- nla_nest_end(skb, nest_parms);
- spin_unlock_bh(&ct->lock);
- return 0;
-
-nla_put_failure:
- spin_unlock_bh(&ct->lock);
- return -1;
-}
-
-static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
- [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
- [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
- [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC },
-};
-
-#define DCCP_NLATTR_SIZE ( \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + 1) + \
- NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \
- NLA_ALIGN(NLA_HDRLEN + 0))
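
With NLA_HDRLEN equal to 4 and NLA_ALIGN rounding up to a multiple of 4, the macro evaluates to 8 + 8 + 12 + 4 = 32 bytes: two u8 attributes, one u64, and a zero-length pad. A runnable check of the arithmetic, with the netlink constants restated locally rather than pulled from kernel headers:

#include <stdio.h>

#define MY_NLA_ALIGNTO	4
#define MY_NLA_ALIGN(x)	(((x) + MY_NLA_ALIGNTO - 1) & ~(MY_NLA_ALIGNTO - 1))
#define MY_NLA_HDRLEN	4

int main(void)
{
	unsigned int sz = MY_NLA_ALIGN(MY_NLA_HDRLEN + 1) +	/* state: 8  */
			  MY_NLA_ALIGN(MY_NLA_HDRLEN + 1) +	/* role:  8  */
			  MY_NLA_ALIGN(MY_NLA_HDRLEN + 8) +	/* seq:  12  */
			  MY_NLA_ALIGN(MY_NLA_HDRLEN + 0);	/* pad:   4  */

	printf("%u\n", sz);	/* 32 */
	return 0;
}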
-
-static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
-{
- struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
- struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
- int err;
-
- if (!attr)
- return 0;
-
- err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr,
- dccp_nla_policy, NULL);
- if (err < 0)
- return err;
-
- if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
- !tb[CTA_PROTOINFO_DCCP_ROLE] ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
- nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
- return -EINVAL;
- }
-
- spin_lock_bh(&ct->lock);
- ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
- if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
- } else {
- ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
- ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
- }
- if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
- ct->proto.dccp.handshake_seq =
- be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
- }
- spin_unlock_bh(&ct->lock);
- return 0;
-}
-#endif
-
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
- struct net *net, void *data)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
- unsigned int *timeouts = data;
- int i;
-
- /* set default DCCP timeouts. */
- for (i = 0; i < CT_DCCP_MAX; i++)
- timeouts[i] = dn->dccp_timeout[i];
-
- /* there's a 1:1 mapping between attributes and protocol states. */
- for (i = CTA_TIMEOUT_DCCP_UNSPEC + 1; i < CTA_TIMEOUT_DCCP_MAX + 1; i++) {
- if (tb[i])
- timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
- }
-
- timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST];
- return 0;
-}
-
-static int
-dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
- const unsigned int *timeouts = data;
- int i;
-
- for (i = CTA_TIMEOUT_DCCP_UNSPEC + 1; i < CTA_TIMEOUT_DCCP_MAX + 1; i++) {
- if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
- goto nla_put_failure;
- }
- return 0;
-
-nla_put_failure:
- return -ENOSPC;
-}
-
-static const struct nla_policy
-dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
- [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 },
- [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-/* template, data assigned later */
-static struct ctl_table dccp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_dccp_timeout_request",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_respond",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_partopen",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_open",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_closereq",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_closing",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_timeout_timewait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_dccp_loose",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
- struct nf_dccp_net *dn)
-{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table)
- return 0;
-
- pn->ctl_table = kmemdup(dccp_sysctl_table,
- sizeof(dccp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST];
- pn->ctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND];
- pn->ctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN];
- pn->ctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN];
- pn->ctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ];
- pn->ctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING];
- pn->ctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT];
- pn->ctl_table[7].data = &dn->dccp_loose;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- pn->ctl_table[0].procname = NULL;
-#endif
- return 0;
-}
-
-static int dccp_init_net(struct net *net)
-{
- struct nf_dccp_net *dn = nf_dccp_pernet(net);
- struct nf_proto_net *pn = &dn->pn;
-
- if (!pn->users) {
- /* default values */
- dn->dccp_loose = 1;
- dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL;
- dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ;
- dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL;
-
- /* timeouts[0] (CT_DCCP_NONE) is unused; mirror the REQUEST timeout
- * (DCCP's analogue of TCP's SYN_SENT) so ->timeouts[0] contains the
- * 'new' timeout, like udp or icmp.
- */
- dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST];
- }
-
- return dccp_kmemdup_sysctl_table(net, pn, dn);
-}
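
The defaults tie back to the RFC 4340 excerpts at the top of this file: DCCP_MSL is two minutes of jiffies, so the 4 * DCCP_MSL used for RESPOND and PARTOPEN is the quoted 4MSL (8 minutes), and the 12-hour OPEN timeout stays under the roughly 11.9-hour timestamp wraparound. A quick restatement with HZ folded out (values in seconds):

#include <stdio.h>

int main(void)
{
	unsigned int msl = 2 * 60;	/* DCCP_MSL without the HZ factor */

	printf("REQUEST:  %5u s (2*MSL)\n", 2 * msl);	/*   240 */
	printf("RESPOND:  %5u s (4*MSL)\n", 4 * msl);	/*   480 */
	printf("PARTOPEN: %5u s (4*MSL)\n", 4 * msl);	/*   480 */
	printf("OPEN:     %5u s (12 h)\n", 12 * 3600);	/* 43200 */
	printf("TIMEWAIT: %5u s (2*MSL)\n", 2 * msl);	/*   240 */
	return 0;
}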
-
-static struct nf_proto_net *dccp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.dccp.pn;
-}
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
- .l4proto = IPPROTO_DCCP,
- .packet = dccp_packet,
- .can_early_drop = dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
- .print_conntrack = dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_size = DCCP_NLATTR_SIZE,
- .to_nlattr = dccp_to_nlattr,
- .from_nlattr = nlattr_to_dccp,
- .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
- .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
- .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
- .nla_policy = nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- .ctnl_timeout = {
- .nlattr_to_obj = dccp_timeout_nlattr_to_obj,
- .obj_to_nlattr = dccp_timeout_obj_to_nlattr,
- .nlattr_max = CTA_TIMEOUT_DCCP_MAX,
- .obj_size = sizeof(unsigned int) * CT_DCCP_MAX,
- .nla_policy = dccp_timeout_nla_policy,
- },
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = dccp_init_net,
- .get_net_proto = dccp_get_net_proto,
-};
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 5da19d5fbc76..e831637bc8ca 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -15,50 +12,6 @@
static const unsigned int nf_ct_generic_timeout = 600*HZ;
-static bool nf_generic_should_process(u8 proto)
-{
- switch (proto) {
-#ifdef CONFIG_NF_CT_PROTO_GRE_MODULE
- case IPPROTO_GRE:
- return false;
-#endif
- default:
- return true;
- }
-}
-
-static bool generic_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
-
- return true;
-}
-
-/* Returns verdict for packet, or -1 for invalid. */
-static int generic_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
-{
- const unsigned int *timeout = nf_ct_timeout_lookup(ct);
-
- if (!nf_generic_should_process(nf_ct_protonum(ct))) {
- pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n",
- nf_ct_protonum(ct));
- return -NF_ACCEPT;
- }
-
- if (!timeout)
- timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
-
- nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
- return NF_ACCEPT;
-}
-
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
@@ -104,53 +57,16 @@ generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table generic_sysctl_table[] = {
- {
- .procname = "nf_conntrack_generic_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_generic_net *gn)
-{
-#ifdef CONFIG_SYSCTL
- pn->ctl_table = kmemdup(generic_sysctl_table,
- sizeof(generic_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &gn->timeout;
-#endif
- return 0;
-}
-
-static int generic_init_net(struct net *net)
+void nf_conntrack_generic_init_net(struct net *net)
{
struct nf_generic_net *gn = nf_generic_pernet(net);
- struct nf_proto_net *pn = &gn->pn;
gn->timeout = nf_ct_generic_timeout;
-
- return generic_kmemdup_sysctl_table(pn, gn);
-}
-
-static struct nf_proto_net *generic_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.generic.pn;
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
{
.l4proto = 255,
- .pkt_to_tuple = generic_pkt_to_tuple,
- .packet = generic_packet,
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = generic_timeout_nlattr_to_obj,
@@ -160,6 +76,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
.nla_policy = generic_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = generic_init_net,
- .get_net_proto = generic_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 8899b51aad44..af369e686fc5 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * ip_conntrack_proto_gre.c - Version 3.0
- *
* Connection tracking protocol helper module for GRE.
*
* GRE is a generic encapsulation protocol, which is generally not very
@@ -48,24 +47,12 @@ static const unsigned int gre_timeouts[GRE_CT_MAX] = {
[GRE_CT_REPLIED] = 180*HZ,
};
-static unsigned int proto_gre_net_id __read_mostly;
+/* used when expectation is added */
+static DEFINE_SPINLOCK(keymap_lock);
-static inline struct netns_proto_gre *gre_pernet(struct net *net)
+static inline struct nf_gre_net *gre_pernet(struct net *net)
{
- return net_generic(net, proto_gre_net_id);
-}
-
-static void nf_ct_gre_keymap_flush(struct net *net)
-{
- struct netns_proto_gre *net_gre = gre_pernet(net);
- struct nf_ct_gre_keymap *km, *tmp;
-
- write_lock_bh(&net_gre->keymap_lock);
- list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
- list_del(&km->list);
- kfree(km);
- }
- write_unlock_bh(&net_gre->keymap_lock);
+ return &net->ct.nf_ct_proto.gre;
}
static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
@@ -81,18 +68,16 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
/* look up the source key for a given tuple */
static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
{
- struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_gre_net *net_gre = gre_pernet(net);
struct nf_ct_gre_keymap *km;
__be16 key = 0;
- read_lock_bh(&net_gre->keymap_lock);
- list_for_each_entry(km, &net_gre->keymap_list, list) {
+ list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
if (gre_key_cmpfn(km, t)) {
key = km->tuple.src.u.gre.key;
break;
}
}
- read_unlock_bh(&net_gre->keymap_lock);
pr_debug("lookup src key 0x%x for ", key);
nf_ct_dump_tuple(t);
@@ -105,21 +90,17 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
struct nf_conntrack_tuple *t)
{
struct net *net = nf_ct_net(ct);
- struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_gre_net *net_gre = gre_pernet(net);
struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
struct nf_ct_gre_keymap **kmp, *km;
kmp = &ct_pptp_info->keymap[dir];
if (*kmp) {
/* check whether it's a retransmission */
- read_lock_bh(&net_gre->keymap_lock);
- list_for_each_entry(km, &net_gre->keymap_list, list) {
- if (gre_key_cmpfn(km, t) && km == *kmp) {
- read_unlock_bh(&net_gre->keymap_lock);
+ list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
+ if (gre_key_cmpfn(km, t) && km == *kmp)
return 0;
- }
}
- read_unlock_bh(&net_gre->keymap_lock);
pr_debug("trying to override keymap_%s for ct %p\n",
dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
return -EEXIST;
@@ -134,9 +115,9 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
pr_debug("adding new entry %p: ", km);
nf_ct_dump_tuple(&km->tuple);
- write_lock_bh(&net_gre->keymap_lock);
+ spin_lock_bh(&keymap_lock);
list_add_tail(&km->list, &net_gre->keymap_list);
- write_unlock_bh(&net_gre->keymap_lock);
+ spin_unlock_bh(&keymap_lock);
return 0;
}
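
This hunk swaps the per-netns rwlock for RCU on the read side plus one global spinlock serializing the rare, expectation-time writers; entries are then freed through kfree_rcu() so lockless readers never see freed memory. A minimal standalone sketch of the pattern, with hypothetical item names:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(writer_lock);
static LIST_HEAD(item_list);

struct item {
	struct list_head list;
	struct rcu_head rcu;
	u32 key;
};

/* reader: lockless, caller runs under rcu_read_lock() */
static struct item *item_lookup(u32 key)
{
	struct item *i;

	list_for_each_entry_rcu(i, &item_list, list) {
		if (i->key == key)
			return i;
	}
	return NULL;
}

/* writers: serialized by the spinlock; free after a grace period */
static void item_add(struct item *i)
{
	spin_lock_bh(&writer_lock);
	list_add_tail_rcu(&i->list, &item_list);
	spin_unlock_bh(&writer_lock);
}

static void item_del(struct item *i)
{
	spin_lock_bh(&writer_lock);
	list_del_rcu(&i->list);
	spin_unlock_bh(&writer_lock);
	kfree_rcu(i, rcu);
}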
@@ -145,32 +126,30 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
/* destroy the keymap entries associated with specified master ct */
void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
- struct netns_proto_gre *net_gre = gre_pernet(net);
struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
enum ip_conntrack_dir dir;
pr_debug("entering for ct %p\n", ct);
- write_lock_bh(&net_gre->keymap_lock);
+ spin_lock_bh(&keymap_lock);
for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) {
if (ct_pptp_info->keymap[dir]) {
pr_debug("removing %p from list\n",
ct_pptp_info->keymap[dir]);
- list_del(&ct_pptp_info->keymap[dir]->list);
- kfree(ct_pptp_info->keymap[dir]);
+ list_del_rcu(&ct_pptp_info->keymap[dir]->list);
+ kfree_rcu(ct_pptp_info->keymap[dir], rcu);
ct_pptp_info->keymap[dir] = NULL;
}
}
- write_unlock_bh(&net_gre->keymap_lock);
+ spin_unlock_bh(&keymap_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
/* gre hdr info to tuple */
-static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
+bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct pptp_gre_header *pgrehdr;
struct pptp_gre_header _pgrehdr;
@@ -216,18 +195,17 @@ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct)
static unsigned int *gre_get_timeouts(struct net *net)
{
- return gre_pernet(net)->gre_timeouts;
+ return gre_pernet(net)->timeouts;
}
/* Returns verdict for packet, and may modify conntrack */
-static int gre_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_gre_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
- if (state->pf != NFPROTO_IPV4)
- return -NF_ACCEPT;
+ unsigned long status;
if (!nf_ct_is_confirmed(ct)) {
unsigned int *timeouts = nf_ct_timeout_lookup(ct);
@@ -241,11 +219,17 @@ static int gre_packet(struct nf_conn *ct,
ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED];
}
+ status = READ_ONCE(ct->status);
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
- if (ct->status & IPS_SEEN_REPLY) {
+ if (status & IPS_SEEN_REPLY) {
nf_ct_refresh_acct(ct, ctinfo, skb,
ct->proto.gre.stream_timeout);
+
+ /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
+ if (unlikely((status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe. */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
@@ -256,19 +240,6 @@ static int gre_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Called when a conntrack entry has already been removed from the hashes
- * and is about to be deleted from memory */
-static void gre_destroy(struct nf_conn *ct)
-{
- struct nf_conn *master = ct->master;
- pr_debug(" entering\n");
-
- if (!master)
- pr_debug("no master !?!\n");
- else
- nf_ct_gre_keymap_destroy(master);
-}
-
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
#include <linux/netfilter/nfnetlink.h>
@@ -278,13 +249,13 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
unsigned int *timeouts = data;
- struct netns_proto_gre *net_gre = gre_pernet(net);
+ struct nf_gre_net *net_gre = gre_pernet(net);
if (!timeouts)
timeouts = gre_get_timeouts(net);
/* set default timeouts for GRE. */
- timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
- timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
+ timeouts[GRE_CT_UNREPLIED] = net_gre->timeouts[GRE_CT_UNREPLIED];
+ timeouts[GRE_CT_REPLIED] = net_gre->timeouts[GRE_CT_REPLIED];
if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) {
timeouts[GRE_CT_UNREPLIED] =
@@ -320,69 +291,23 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table gre_sysctl_table[] = {
- {
- .procname = "nf_conntrack_gre_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_gre_timeout_stream",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {}
-};
-#endif
-
-static int gre_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *nf,
- struct netns_proto_gre *net_gre)
+void nf_conntrack_gre_init_net(struct net *net)
{
-#ifdef CONFIG_SYSCTL
+ struct nf_gre_net *net_gre = gre_pernet(net);
int i;
- if (nf->ctl_table)
- return 0;
-
- nf->ctl_table = kmemdup(gre_sysctl_table,
- sizeof(gre_sysctl_table),
- GFP_KERNEL);
- if (!nf->ctl_table)
- return -ENOMEM;
-
- for (i = 0; i < GRE_CT_MAX; i++)
- nf->ctl_table[i].data = &net_gre->gre_timeouts[i];
-#endif
- return 0;
-}
-
-static int gre_init_net(struct net *net)
-{
- struct netns_proto_gre *net_gre = gre_pernet(net);
- struct nf_proto_net *nf = &net_gre->nf;
- int i;
-
- rwlock_init(&net_gre->keymap_lock);
INIT_LIST_HEAD(&net_gre->keymap_list);
for (i = 0; i < GRE_CT_MAX; i++)
- net_gre->gre_timeouts[i] = gre_timeouts[i];
-
- return gre_kmemdup_sysctl_table(net, nf, net_gre);
+ net_gre->timeouts[i] = gre_timeouts[i];
}
/* protocol helper struct */
-static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
+const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = {
.l4proto = IPPROTO_GRE,
- .pkt_to_tuple = gre_pkt_to_tuple,
+ .allow_clash = true,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = gre_print_conntrack,
#endif
- .packet = gre_packet,
- .destroy = gre_destroy,
- .me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
@@ -398,61 +323,4 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
.nla_policy = gre_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .net_id = &proto_gre_net_id,
- .init_net = gre_init_net,
-};
-
-static int proto_gre_net_init(struct net *net)
-{
- int ret = 0;
-
- ret = nf_ct_l4proto_pernet_register_one(net,
- &nf_conntrack_l4proto_gre4);
- if (ret < 0)
- pr_err("nf_conntrack_gre4: pernet registration failed.\n");
- return ret;
-}
-
-static void proto_gre_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister_one(net, &nf_conntrack_l4proto_gre4);
- nf_ct_gre_keymap_flush(net);
-}
-
-static struct pernet_operations proto_gre_net_ops = {
- .init = proto_gre_net_init,
- .exit = proto_gre_net_exit,
- .id = &proto_gre_net_id,
- .size = sizeof(struct netns_proto_gre),
};
-
-static int __init nf_ct_proto_gre_init(void)
-{
- int ret;
-
- BUILD_BUG_ON(offsetof(struct netns_proto_gre, nf) != 0);
-
- ret = register_pernet_subsys(&proto_gre_net_ops);
- if (ret < 0)
- goto out_pernet;
- ret = nf_ct_l4proto_register_one(&nf_conntrack_l4proto_gre4);
- if (ret < 0)
- goto out_gre4;
-
- return 0;
-out_gre4:
- unregister_pernet_subsys(&proto_gre_net_ops);
-out_pernet:
- return ret;
-}
-
-static void __exit nf_ct_proto_gre_fini(void)
-{
- nf_ct_l4proto_unregister_one(&nf_conntrack_l4proto_gre4);
- unregister_pernet_subsys(&proto_gre_net_ops);
-}
-
-module_init(nf_ct_proto_gre_init);
-module_exit(nf_ct_proto_gre_fini);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index de64d8a5fdfd..b38b7164acd5 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -23,10 +20,12 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
+#include "nf_internals.h"
+
static const unsigned int nf_ct_icmp_timeout = 30*HZ;
-static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
+bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
+ struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct icmphdr *hp;
struct icmphdr _hdr;
@@ -54,8 +53,8 @@ static const u_int8_t invmap[] = {
[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
};
-static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
if (orig->dst.u.icmp.type >= sizeof(invmap) ||
!invmap[orig->dst.u.icmp.type])
@@ -68,11 +67,10 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
}
/* Returns verdict for packet, or -1 for invalid. */
-static int icmp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_icmp_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
@@ -104,53 +102,94 @@ static int icmp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
-static int
-icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb,
- const struct nf_hook_state *state)
+/* Check inner header is related to any of the existing connections */
+int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state,
+ u8 l4proto, union nf_inet_addr *outer_daddr)
{
struct nf_conntrack_tuple innertuple, origtuple;
- const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
+ union nf_inet_addr *ct_daddr;
+ enum ip_conntrack_dir dir;
+ struct nf_conn *ct;
WARN_ON(skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
- if (!nf_ct_get_tuplepr(skb,
- skb_network_offset(skb) + ip_hdrlen(skb)
- + sizeof(struct icmphdr),
- PF_INET, state->net, &origtuple)) {
- pr_debug("icmp_error_message: failed to get tuple\n");
+ if (!nf_ct_get_tuplepr(skb, dataoff,
+ state->pf, state->net, &origtuple))
return -NF_ACCEPT;
- }
-
- /* rcu_read_lock()ed by nf_hook_thresh */
- innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
- pr_debug("icmp_error_message: no match\n");
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple))
return -NF_ACCEPT;
- }
-
- ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(state->net, zone, &innertuple);
- if (!h) {
- pr_debug("icmp_error_message: no match\n");
+ if (!h)
+ return -NF_ACCEPT;
+
+ /* Consider: A -> T (=This machine) -> B
+ * Conntrack entry will look like this:
+ * Original: A->B
+ * Reply: B->T (SNAT case) OR A
+ *
+ * When this function runs, we got packet that looks like this:
+ * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..).
+ *
+ * Above nf_conntrack_find_get() makes lookup based on inner_hdr,
+ * so we should expect that destination of the found connection
+ * matches outer header destination address.
+ *
+ * In above example, we can consider these two cases:
+ * 1. Error coming in reply direction from B or M (middle box) to
+ * T (SNAT case) or A.
+ * Inner saddr will be B, dst will be T or A.
+ * The found conntrack will be reply tuple (B->T/A).
+ * 2. Error coming in original direction from A or M to B.
+ * Inner saddr will be A, inner daddr will be B.
+ * The found conntrack will be original tuple (A->B).
+ *
+ * In both cases, conntrack[dir].dst == inner.dst.
+ *
+ * A bogus packet could look like this:
+ * Inner: B->T
+ * Outer: B->X (other machine reachable by T).
+ *
+ * In this case, lookup yields connection A->B and will
+ * set packet from B->X as *RELATED*, even though no connection
+ * from X was ever seen.
+ */
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ dir = NF_CT_DIRECTION(h);
+ ct_daddr = &ct->tuplehash[dir].tuple.dst.u3;
+ if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) {
+ if (state->pf == AF_INET) {
+ nf_l4proto_log_invalid(skb, state,
+ l4proto,
+ "outer daddr %pI4 != inner %pI4",
+ &outer_daddr->ip, &ct_daddr->ip);
+ } else if (state->pf == AF_INET6) {
+ nf_l4proto_log_invalid(skb, state,
+ l4proto,
+ "outer daddr %pI6 != inner %pI6",
+ &outer_daddr->ip6, &ct_daddr->ip6);
+ }
+ nf_ct_put(ct);
return -NF_ACCEPT;
}
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ ctinfo = IP_CT_RELATED;
+ if (dir == IP_CT_DIR_REPLY)
ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
- nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
+ nf_ct_set(skb, ct, ctinfo);
return NF_ACCEPT;
}
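
A concrete instance of the check above, with made-up IPv4 addresses:

/* conntrack entry: orig 10.0.0.1 -> 10.0.0.2 (reply 10.0.0.2 -> 10.0.0.1)
 *
 * legit error:  outer 10.0.0.2 -> 10.0.0.1, embedded 10.0.0.1 -> 10.0.0.2
 *   lookup finds the reply tuple, dir == IP_CT_DIR_REPLY,
 *   ct_daddr == 10.0.0.1 == outer daddr        -> marked RELATED
 *
 * bogus error:  outer 10.0.0.2 -> 192.168.0.9, same embedded header
 *   same lookup and dir, but ct_daddr (10.0.0.1) != outer
 *   daddr (192.168.0.9)                        -> logged, -NF_ACCEPT
 */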
@@ -158,8 +197,7 @@ static void icmp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMP, "%s", msg);
}
/* Small and modified version of icmp_rcv */
@@ -167,20 +205,21 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff,
const struct nf_hook_state *state)
{
+ union nf_inet_addr outer_daddr;
const struct icmphdr *icmph;
struct icmphdr _ih;
/* Not enough header? */
- icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
+ icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
if (icmph == NULL) {
icmp_error_log(skb, state, "short packet");
return -NF_ACCEPT;
}
- /* See ip_conntrack_proto_tcp.c */
+ /* See nf_conntrack_proto_tcp.c */
if (state->net->ct.sysctl_checksum &&
state->hook == NF_INET_PRE_ROUTING &&
- nf_ip_checksum(skb, state->hook, dataoff, 0)) {
+ nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) {
icmp_error_log(skb, state, "bad hw icmp checksum");
return -NF_ACCEPT;
}
@@ -197,14 +236,15 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
}
/* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_SOURCE_QUENCH &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB &&
- icmph->type != ICMP_REDIRECT)
+ if (!icmp_is_err(icmph->type))
return NF_ACCEPT;
- return icmp_error_message(tmpl, skb, state);
+ memset(&outer_daddr, 0, sizeof(outer_daddr));
+ outer_daddr.ip = ip_hdr(skb)->daddr;
+
+ dataoff += sizeof(*icmph);
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMP, &outer_daddr);
}
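
icmp_is_err() replaces the old five-way type comparison; the helper (defined in linux/icmp.h elsewhere in the tree) is essentially the same test folded into a switch:

static inline bool icmp_is_err(int type)
{
	switch (type) {
	case ICMP_DEST_UNREACH:
	case ICMP_SOURCE_QUENCH:
	case ICMP_REDIRECT:
	case ICMP_TIME_EXCEEDED:
	case ICMP_PARAMETERPROB:
		return true;
	}

	return false;
}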
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -232,20 +272,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
};
static int icmp_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_ICMP_TYPE] ||
- !tb[CTA_PROTO_ICMP_CODE] ||
- !tb[CTA_PROTO_ICMP_ID])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) {
+ if (!tb[CTA_PROTO_ICMP_TYPE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
+ if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type])
+ return -EINVAL;
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) {
+ if (!tb[CTA_PROTO_ICMP_CODE])
+ return -EINVAL;
- tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
- tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
- tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
+ }
- if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
- !invmap[tuple->dst.u.icmp.type])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) {
+ if (!tb[CTA_PROTO_ICMP_ID])
+ return -EINVAL;
+
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
+ }
return 0;
}
@@ -303,56 +355,16 @@ icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table icmp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_icmp_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_icmp_net *in)
-{
-#ifdef CONFIG_SYSCTL
- pn->ctl_table = kmemdup(icmp_sysctl_table,
- sizeof(icmp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &in->timeout;
-#endif
- return 0;
-}
-
-static int icmp_init_net(struct net *net)
+void nf_conntrack_icmp_init_net(struct net *net)
{
struct nf_icmp_net *in = nf_icmp_pernet(net);
- struct nf_proto_net *pn = &in->pn;
in->timeout = nf_ct_icmp_timeout;
-
- return icmp_kmemdup_sysctl_table(pn, in);
-}
-
-static struct nf_proto_net *icmp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.icmp.pn;
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
.l4proto = IPPROTO_ICMP,
- .pkt_to_tuple = icmp_pkt_to_tuple,
- .invert_tuple = icmp_invert_tuple,
- .packet = icmp_packet,
- .destroy = NULL,
- .me = NULL,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmp_tuple_to_nlattr,
.nlattr_tuple_size = icmp_nlattr_tuple_size,
@@ -368,6 +380,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
.nla_policy = icmp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = icmp_init_net,
- .get_net_proto = icmp_get_net_proto,
};
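The icmp_nlattr_to_tuple() rewrite above turns three formerly mandatory attributes into independently optional ones gated by filter flags: a set flag with a missing attribute is an error, a clear flag skips the attribute entirely. A compact sketch of that gate-then-require pattern, with FLAG(), attrs[] and struct tuple invented for the example (this is not the netlink API):

#include <stdint.h>

enum { ATTR_TYPE, ATTR_CODE, ATTR_ID, ATTR_MAX };
#define FLAG(a) (1u << (a))

struct tuple { uint8_t type, code; uint16_t id; };

static int nlattr_to_tuple(const uint8_t *attrs[ATTR_MAX],
                           struct tuple *t, uint32_t flags)
{
    if (flags & FLAG(ATTR_TYPE)) {
        if (!attrs[ATTR_TYPE])
            return -1;              /* requested but absent */
        t->type = attrs[ATTR_TYPE][0];
    }
    if (flags & FLAG(ATTR_CODE)) {
        if (!attrs[ATTR_CODE])
            return -1;
        t->code = attrs[ATTR_CODE][0];
    }
    if (flags & FLAG(ATTR_ID)) {
        if (!attrs[ATTR_ID])
            return -1;
        t->id = (uint16_t)((attrs[ATTR_ID][0] << 8) | attrs[ATTR_ID][1]);
    }
    return 0;                       /* anything not flagged stays unset */
}

This is what allows filtering on, say, the ICMP type alone without also supplying code and id.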
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index a15eefb8e317..327b8059025d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C)2003,2004 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Author:
* Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
*/
@@ -25,15 +22,16 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
#include <net/netfilter/nf_log.h>
+#include "nf_internals.h"
+
static const unsigned int nf_ct_icmpv6_timeout = 30*HZ;
-static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct net *net,
- struct nf_conntrack_tuple *tuple)
+bool icmpv6_pkt_to_tuple(const struct sk_buff *skb,
+ unsigned int dataoff,
+ struct net *net,
+ struct nf_conntrack_tuple *tuple)
{
const struct icmp6hdr *hp;
struct icmp6hdr _hdr;
@@ -64,11 +62,13 @@ static const u_int8_t noct_valid_new[] = {
[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
- [ICMPV6_MLD2_REPORT - 130] = 1
+ [ICMPV6_MLD2_REPORT - 130] = 1,
+ [ICMPV6_MRDISC_ADV - 130] = 1,
+ [ICMPV6_MRDISC_SOL - 130] = 1
};
-static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
+bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_tuple *orig)
{
int type = orig->dst.u.icmp.type - 128;
if (type < 0 || type >= sizeof(invmap) || !invmap[type])
@@ -86,11 +86,10 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
}
/* Returns verdict for packet, or -1 for invalid. */
-static int icmpv6_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_icmpv6_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
unsigned int *timeout = nf_ct_timeout_lookup(ct);
static const u8 valid_new[] = {
@@ -124,62 +123,62 @@ static int icmpv6_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static int
-icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
- struct sk_buff *skb,
- unsigned int icmp6off)
+
+static void icmpv6_error_log(const struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ const char *msg)
+{
+ nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg);
+}
+
+static noinline_for_stack int
+nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct nf_hook_state *state)
{
- struct nf_conntrack_tuple intuple, origtuple;
- const struct nf_conntrack_tuple_hash *h;
- const struct nf_conntrack_l4proto *inproto;
- enum ip_conntrack_info ctinfo;
- struct nf_conntrack_zone tmp;
-
- WARN_ON(skb_nfct(skb));
-
- /* Are they talking about one of our connections? */
- if (!nf_ct_get_tuplepr(skb,
- skb_network_offset(skb)
- + sizeof(struct ipv6hdr)
- + sizeof(struct icmp6hdr),
- PF_INET6, net, &origtuple)) {
- pr_debug("icmpv6_error: Can't get tuple\n");
+ u8 hl = ipv6_hdr(skb)->hop_limit;
+ union nf_inet_addr outer_daddr;
+ union {
+ struct nd_opt_hdr nd_opt;
+ struct rd_msg rd_msg;
+ } tmp;
+ const struct nd_opt_hdr *nd_opt;
+ const struct rd_msg *rd_msg;
+
+ rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg);
+ if (!rd_msg) {
+ icmpv6_error_log(skb, state, "short redirect");
return -NF_ACCEPT;
}
- /* rcu_read_lock()ed by nf_hook_thresh */
- inproto = __nf_ct_l4proto_find(origtuple.dst.protonum);
+ if (rd_msg->icmph.icmp6_code != 0)
+ return NF_ACCEPT;
- /* Ordinarily, we'd expect the inverted tupleproto, but it's
- been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) {
- pr_debug("icmpv6_error: Can't invert tuple\n");
+ if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
+ icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect");
return -NF_ACCEPT;
}
- ctinfo = IP_CT_RELATED;
+ dataoff += sizeof(*rd_msg);
- h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
- &intuple);
- if (!h) {
- pr_debug("icmpv6_error: no match\n");
+ /* warning: rd_msg no longer usable after this call */
+ nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt);
+ if (!nd_opt || nd_opt->nd_opt_len == 0) {
+ icmpv6_error_log(skb, state, "redirect without options");
return -NF_ACCEPT;
- } else {
- if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
- ctinfo += IP_CT_IS_REPLY;
}
- /* Update skb to refer to this connection */
- nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
- return NF_ACCEPT;
-}
+ /* We could call ndisc_parse_options(), but it would need
+ * skb_linearize() and a bit more work.
+ */
+ if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR)
+ return NF_ACCEPT;
-static void icmpv6_error_log(const struct sk_buff *skb,
- const struct nf_hook_state *state,
- const char *msg)
-{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_ICMPV6, "%s", msg);
+ memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+ sizeof(outer_daddr.ip6));
+ dataoff += 8;
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMPV6, &outer_daddr);
}
int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
@@ -187,6 +186,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
unsigned int dataoff,
const struct nf_hook_state *state)
{
+ union nf_inet_addr outer_daddr;
const struct icmp6hdr *icmp6h;
struct icmp6hdr _ih;
int type;
@@ -211,11 +211,18 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
return NF_ACCEPT;
}
+ if (icmp6h->icmp6_type == NDISC_REDIRECT)
+ return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state);
+
	/* not an error message? */
if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT;
- return icmpv6_error_message(state->net, tmpl, skb, dataoff);
+ memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
+ sizeof(outer_daddr.ip6));
+ dataoff += sizeof(*icmp6h);
+ return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
+ IPPROTO_ICMPV6, &outer_daddr);
}
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -242,21 +249,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = {
};
static int icmpv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *tuple)
+ struct nf_conntrack_tuple *tuple,
+ u_int32_t flags)
{
- if (!tb[CTA_PROTO_ICMPV6_TYPE] ||
- !tb[CTA_PROTO_ICMPV6_CODE] ||
- !tb[CTA_PROTO_ICMPV6_ID])
- return -EINVAL;
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) {
+ if (!tb[CTA_PROTO_ICMPV6_TYPE])
+ return -EINVAL;
+
+ tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
+ if (tuple->dst.u.icmp.type < 128 ||
+ tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type - 128])
+ return -EINVAL;
+ }
- tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]);
- tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
- tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) {
+ if (!tb[CTA_PROTO_ICMPV6_CODE])
+ return -EINVAL;
- if (tuple->dst.u.icmp.type < 128 ||
- tuple->dst.u.icmp.type - 128 >= sizeof(invmap) ||
- !invmap[tuple->dst.u.icmp.type - 128])
- return -EINVAL;
+ tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]);
+ }
+
+ if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) {
+ if (!tb[CTA_PROTO_ICMPV6_ID])
+ return -EINVAL;
+
+ tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]);
+ }
return 0;
}
@@ -314,54 +333,16 @@ icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table icmpv6_sysctl_table[] = {
- {
- .procname = "nf_conntrack_icmpv6_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_icmp_net *in)
-{
-#ifdef CONFIG_SYSCTL
- pn->ctl_table = kmemdup(icmpv6_sysctl_table,
- sizeof(icmpv6_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &in->timeout;
-#endif
- return 0;
-}
-
-static int icmpv6_init_net(struct net *net)
+void nf_conntrack_icmpv6_init_net(struct net *net)
{
struct nf_icmp_net *in = nf_icmpv6_pernet(net);
- struct nf_proto_net *pn = &in->pn;
in->timeout = nf_ct_icmpv6_timeout;
-
- return icmpv6_kmemdup_sysctl_table(pn, in);
-}
-
-static struct nf_proto_net *icmpv6_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.icmpv6.pn;
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
{
.l4proto = IPPROTO_ICMPV6,
- .pkt_to_tuple = icmpv6_pkt_to_tuple,
- .invert_tuple = icmpv6_invert_tuple,
- .packet = icmpv6_packet,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmpv6_tuple_to_nlattr,
.nlattr_tuple_size = icmpv6_nlattr_tuple_size,
@@ -377,6 +358,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
.nla_policy = icmpv6_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = icmpv6_init_net,
- .get_net_proto = icmpv6_get_net_proto,
};
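A recurring detail in the ICMPv6 hunks above: informational types start at 128, so every lookup into invmap subtracts 128 and bounds-checks the index first. A small standalone sketch of that check, assuming a toy two-entry table (the kernel's invmap is larger):

#include <stddef.h>
#include <stdint.h>

/* toy inversion map, indexed by (type - 128); value is reply type + 1,
 * 0 means the type is not tracked */
static const uint8_t invmap[] = {
    [128 - 128] = 129 + 1,   /* echo request -> echo reply   */
    [129 - 128] = 128 + 1,   /* echo reply   -> echo request */
};

static int invert_type(uint8_t type, uint8_t *out)
{
    int idx = type - 128;    /* informational ICMPv6 types begin at 128 */

    if (idx < 0 || (size_t)idx >= sizeof(invmap) || !invmap[idx])
        return -1;
    *out = invmap[idx] - 1;
    return 0;
}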
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index d53e3e78f605..7c6f7c9f7332 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Connection tracking protocol helper module for SCTP.
*
@@ -6,10 +7,6 @@
*
* SCTP is defined in RFC 2960. References to various sections in this code
* are to this RFC.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -30,41 +27,31 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
- closely. They're more complex. --RR
-
- And so for me for SCTP :D -Kiran */
-
static const char *const sctp_conntrack_names[] = {
- "NONE",
- "CLOSED",
- "COOKIE_WAIT",
- "COOKIE_ECHOED",
- "ESTABLISHED",
- "SHUTDOWN_SENT",
- "SHUTDOWN_RECD",
- "SHUTDOWN_ACK_SENT",
- "HEARTBEAT_SENT",
- "HEARTBEAT_ACKED",
+ [SCTP_CONNTRACK_NONE] = "NONE",
+ [SCTP_CONNTRACK_CLOSED] = "CLOSED",
+ [SCTP_CONNTRACK_COOKIE_WAIT] = "COOKIE_WAIT",
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = "COOKIE_ECHOED",
+ [SCTP_CONNTRACK_ESTABLISHED] = "ESTABLISHED",
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = "SHUTDOWN_SENT",
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = "SHUTDOWN_RECD",
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = "SHUTDOWN_ACK_SENT",
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT",
};
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
- [SCTP_CONNTRACK_CLOSED] = 10 SECS,
- [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
- [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
- [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS,
- [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000,
- [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS,
+ [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10),
+ [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210),
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30),
};
+#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
+
#define sNO SCTP_CONNTRACK_NONE
#define sCL SCTP_CONNTRACK_CLOSED
#define sCW SCTP_CONNTRACK_COOKIE_WAIT
@@ -74,7 +61,6 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD
#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT
-#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED
#define sIV SCTP_CONNTRACK_MAX
/*
@@ -91,15 +77,12 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als
COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction.
ESTABLISHED - We have seen a COOKIE_ACK in the reply direction.
SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction.
-SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction.
SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
to that of the SHUTDOWN chunk.
CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of
the SHUTDOWN chunk. Connection is closed.
HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow.
-HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to
- that of the HEARTBEAT chunk. Secondary connection is
- established.
*/
/* TODO
@@ -116,33 +99,33 @@ cookie echoed to closed.
static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA},
-/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},
-/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
-/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS},
-/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA},
-/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/
-/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */
-/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */
-/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA},
-/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sCL, sCL, sCW, sCE, sES, sCL, sCL, sSA, sCW},
+/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},
+/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA},
+/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL},/* Can't have Stale cookie*/
+/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL},/* 5.2.4 - Big TODO */
+/* cookie_ack */ {sCL, sCL, sCW, sES, sES, sSS, sSR, sSA, sCL},/* Can't come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL},
+/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
},
{
/* REPLY */
-/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */
-/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */
-/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},
-/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL},
-/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR},
-/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA},
-/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA},
-/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */
-/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA},
-/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA},
-/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA},
-/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA}
+/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS */
+/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV},/* INIT in sCL Big TODO */
+/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV},
+/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV},
+/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV},
+/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV},
+/* cookie_echo */ {sIV, sCL, sCE, sCE, sES, sSS, sSR, sSA, sIV},/* Can't come in reply dir */
+/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV},
+/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS},
+/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sES},
}
};
@@ -154,6 +137,7 @@ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
}
#endif
+/* do_basic_checks ensures sch->length > 0, do not use before */
#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \
for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
(offset) < (skb)->len && \
@@ -164,7 +148,8 @@ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \
static int do_basic_checks(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- unsigned long *map)
+ unsigned long *map,
+ const struct nf_hook_state *state)
{
u_int32_t offset, count;
struct sctp_chunkhdr _sch, *sch;
@@ -173,8 +158,6 @@ static int do_basic_checks(struct nf_conn *ct,
flag = 0;
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
- pr_debug("Chunk Num: %d Type: %d\n", count, sch->type);
-
if (sch->type == SCTP_CID_INIT ||
sch->type == SCTP_CID_INIT_ACK ||
sch->type == SCTP_CID_SHUTDOWN_COMPLETE)
@@ -189,7 +172,9 @@ static int do_basic_checks(struct nf_conn *ct,
sch->type == SCTP_CID_COOKIE_ECHO ||
flag) &&
count != 0) || !sch->length) {
- pr_debug("Basic checks failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "%s failed. chunk num %d, type %d, len %d flag %d\n",
+ __func__, count, sch->type, sch->length, flag);
return 1;
}
@@ -197,7 +182,6 @@ static int do_basic_checks(struct nf_conn *ct,
set_bit(sch->type, map);
}
- pr_debug("Basic checks passed\n");
return count == 0;
}
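do_basic_checks() and the for_each_sctp_chunk() iterator above walk a TLV sequence, and the comment added at the top of the macro matters: a zero chunk length would keep the offset from advancing, so it must be rejected before iteration continues. A standalone sketch of such a guarded walk, assuming SCTP's 4-byte chunk header and 4-byte padding (buffer handling simplified, names invented):

#include <stddef.h>
#include <stdint.h>

/* Count SCTP-style chunks in buf; returns -1 on a short/zero length
 * that would otherwise make the loop spin forever. */
static int count_chunks(const uint8_t *buf, size_t len)
{
    size_t off = 0;
    int count = 0;

    while (off + 4 <= len) {           /* type, flags, 16-bit length */
        uint16_t clen = (uint16_t)((buf[off + 2] << 8) | buf[off + 3]);

        if (clen < 4)
            return -1;                 /* the sch->length == 0 case */
        off += ((size_t)clen + 3) & ~(size_t)3;  /* pad to 4 bytes */
        count++;
    }
    return count;
}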
@@ -207,64 +191,47 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
{
int i;
- pr_debug("Chunk type: %d\n", chunk_type);
-
switch (chunk_type) {
case SCTP_CID_INIT:
- pr_debug("SCTP_CID_INIT\n");
i = 0;
break;
case SCTP_CID_INIT_ACK:
- pr_debug("SCTP_CID_INIT_ACK\n");
i = 1;
break;
case SCTP_CID_ABORT:
- pr_debug("SCTP_CID_ABORT\n");
i = 2;
break;
case SCTP_CID_SHUTDOWN:
- pr_debug("SCTP_CID_SHUTDOWN\n");
i = 3;
break;
case SCTP_CID_SHUTDOWN_ACK:
- pr_debug("SCTP_CID_SHUTDOWN_ACK\n");
i = 4;
break;
case SCTP_CID_ERROR:
- pr_debug("SCTP_CID_ERROR\n");
i = 5;
break;
case SCTP_CID_COOKIE_ECHO:
- pr_debug("SCTP_CID_COOKIE_ECHO\n");
i = 6;
break;
case SCTP_CID_COOKIE_ACK:
- pr_debug("SCTP_CID_COOKIE_ACK\n");
i = 7;
break;
case SCTP_CID_SHUTDOWN_COMPLETE:
- pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n");
i = 8;
break;
case SCTP_CID_HEARTBEAT:
- pr_debug("SCTP_CID_HEARTBEAT");
i = 9;
break;
case SCTP_CID_HEARTBEAT_ACK:
- pr_debug("SCTP_CID_HEARTBEAT_ACK");
i = 10;
break;
default:
/* Other chunks like DATA or SACK do not change the state */
- pr_debug("Unknown chunk type, Will stay in %s\n",
- sctp_conntrack_names[cur_state]);
+ pr_debug("Unknown chunk type %d, Will stay in %s\n",
+ chunk_type, sctp_conntrack_names[cur_state]);
return cur_state;
}
- pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n",
- dir, sctp_conntrack_names[cur_state], chunk_type,
- sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
-
return sctp_conntracks[dir][i][cur_state];
}
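After the pr_debug() pruning, sctp_new_state() is a pure table lookup: the chunk type picks a row, the current state a column, and unknown chunk types fall through to "stay put". A toy version with two states and two events shows the shape (the real table is the 2 x 11 x SCTP_CONNTRACK_MAX array earlier in this file):

#include <stdio.h>

enum state { ST_CLOSED, ST_OPEN, ST_MAX };
enum event { EV_INIT, EV_ABORT, EV_MAX };

static const unsigned char transitions[EV_MAX][ST_MAX] = {
    /*             CLOSED     OPEN      */
    [EV_INIT]  = { ST_OPEN,   ST_OPEN   },
    [EV_ABORT] = { ST_CLOSED, ST_CLOSED },
};

static enum state next_state(int ev, enum state cur)
{
    if (ev < 0 || ev >= EV_MAX)
        return cur;            /* DATA/SACK etc.: state unchanged */
    return (enum state)transitions[ev][cur];
}

int main(void)
{
    printf("%d\n", next_state(EV_INIT, ST_CLOSED));  /* 1 == ST_OPEN */
    return 0;
}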
@@ -311,7 +278,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
pr_debug("Setting vtag %x for secondary conntrack\n",
sh->vtag);
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag;
- } else {
+ } else if (sch->type == SCTP_CID_SHUTDOWN_ACK) {
/* If it is a shutdown ack OOTB packet, we expect a return
shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
pr_debug("Setting vtag %x for new conn OOTB\n",
@@ -319,7 +286,7 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
}
- ct->proto.sctp.state = new_state;
+ ct->proto.sctp.state = SCTP_CONNTRACK_NONE;
}
return true;
@@ -339,7 +306,7 @@ static bool sctp_error(struct sk_buff *skb,
if (state->hook == NF_INET_PRE_ROUTING &&
state->net->ct.sysctl_checksum &&
skb->ip_summed == CHECKSUM_NONE) {
- if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) {
+ if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) {
logmsg = "nf_ct_sctp: failed to read header ";
goto out_invalid;
}
@@ -352,16 +319,16 @@ static bool sctp_error(struct sk_buff *skb,
}
return false;
out_invalid:
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_SCTP, "%s", logmsg);
return true;
}
/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
-static int sctp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_sctp_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
enum sctp_conntrack new_state, old_state;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -372,6 +339,7 @@ static int sctp_packet(struct nf_conn *ct,
u_int32_t offset, count;
unsigned int *timeouts;
unsigned long map[256 / sizeof(unsigned long)] = { 0 };
+ bool ignore = false;
if (sctp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -380,7 +348,7 @@ static int sctp_packet(struct nf_conn *ct,
if (sh == NULL)
goto out;
- if (do_basic_checks(ct, skb, dataoff, map) != 0)
+ if (do_basic_checks(ct, skb, dataoff, map, state) != 0)
goto out;
if (!nf_ct_is_confirmed(ct)) {
@@ -403,7 +371,9 @@ static int sctp_packet(struct nf_conn *ct,
!test_bit(SCTP_CID_HEARTBEAT, map) &&
!test_bit(SCTP_CID_HEARTBEAT_ACK, map) &&
sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "verification tag check failed %x vs %x for dir %d",
+ sh->vtag, ct->proto.sctp.vtag[dir], dir);
goto out;
}
@@ -412,33 +382,67 @@ static int sctp_packet(struct nf_conn *ct,
for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
/* Special cases of Verification tag check (Sec 8.5.1) */
if (sch->type == SCTP_CID_INIT) {
- /* Sec 8.5.1 (A) */
+ /* (A) vtag MUST be zero */
if (sh->vtag != 0)
goto out_unlock;
} else if (sch->type == SCTP_CID_ABORT) {
- /* Sec 8.5.1 (B) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir])
+ /* (B) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
- /* Sec 8.5.1 (C) */
- if (sh->vtag != ct->proto.sctp.vtag[dir] &&
- sh->vtag != ct->proto.sctp.vtag[!dir] &&
- sch->flags & SCTP_CHUNK_FLAG_T)
+ /* (C) vtag MUST match own vtag if T flag is unset OR
+ * MUST match peer's vtag if T flag is set
+ */
+ if ((!(sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[dir]) ||
+ ((sch->flags & SCTP_CHUNK_FLAG_T) &&
+ sh->vtag != ct->proto.sctp.vtag[!dir]))
goto out_unlock;
} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
- /* Sec 8.5.1 (D) */
+ /* (D) vtag must be same as init_vtag as found in INIT_ACK */
if (sh->vtag != ct->proto.sctp.vtag[dir])
goto out_unlock;
- } else if (sch->type == SCTP_CID_HEARTBEAT ||
- sch->type == SCTP_CID_HEARTBEAT_ACK) {
+ } else if (sch->type == SCTP_CID_COOKIE_ACK) {
+ ct->proto.sctp.init[dir] = 0;
+ ct->proto.sctp.init[!dir] = 0;
+ } else if (sch->type == SCTP_CID_HEARTBEAT) {
+ if (ct->proto.sctp.vtag[dir] == 0) {
+ pr_debug("Setting %d vtag %x for dir %d\n", sch->type, sh->vtag, dir);
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ } else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ ct->proto.sctp.flags |= SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.last_dir = dir;
+ ignore = true;
+ continue;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ }
+ } else if (sch->type == SCTP_CID_HEARTBEAT_ACK) {
if (ct->proto.sctp.vtag[dir] == 0) {
pr_debug("Setting vtag %x for dir %d\n",
sh->vtag, dir);
ct->proto.sctp.vtag[dir] = sh->vtag;
} else if (sh->vtag != ct->proto.sctp.vtag[dir]) {
- pr_debug("Verification tag check failed\n");
- goto out_unlock;
+ if (test_bit(SCTP_CID_DATA, map) || ignore)
+ goto out_unlock;
+
+ if ((ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) == 0 ||
+ ct->proto.sctp.last_dir == dir)
+ goto out_unlock;
+
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
+ ct->proto.sctp.vtag[dir] = sh->vtag;
+ ct->proto.sctp.vtag[!dir] = 0;
+ } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) {
+ ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED;
}
}
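The HEARTBEAT/HEARTBEAT-ACK branches above add a two-strike tolerance: a heartbeat carrying a surprising vtag is let through once and flagged rather than dropped, and only a matching ack from the opposite direction may then re-seed the tags, which covers a silently restarted peer. A condensed model of that handshake, with struct conn and the verdict values invented for the sketch (the kernel folds this into the chunk loop under ct->lock):

#include <stdbool.h>
#include <stdint.h>

enum verdict { V_UPDATE, V_IGNORE, V_INVALID };

struct conn {
    uint32_t vtag[2];     /* per-direction verification tags */
    bool hb_vtag_failed;  /* one surprising HEARTBEAT already seen */
    int  last_dir;
};

/* HEARTBEAT with an unexpected vtag: let it pass once, remember it. */
static enum verdict heartbeat(struct conn *c, int dir, uint32_t vtag)
{
    if (c->vtag[dir] == 0) {
        c->vtag[dir] = vtag;           /* first sighting seeds the tag */
        return V_UPDATE;
    }
    if (vtag != c->vtag[dir]) {
        c->hb_vtag_failed = true;
        c->last_dir = dir;
        return V_IGNORE;               /* accept but don't refresh state */
    }
    c->hb_vtag_failed = false;
    return V_UPDATE;
}

/* HEARTBEAT-ACK may re-seed the tags, but only from the direction
 * opposite the earlier failure (a restarted peer). */
static enum verdict heartbeat_ack(struct conn *c, int dir, uint32_t vtag)
{
    if (c->vtag[dir] == 0 || vtag == c->vtag[dir]) {
        c->vtag[dir] = vtag;
        c->hb_vtag_failed = false;
        return V_UPDATE;
    }
    if (!c->hb_vtag_failed || c->last_dir == dir)
        return V_INVALID;
    c->hb_vtag_failed = false;
    c->vtag[dir] = vtag;
    c->vtag[!dir] = 0;                 /* peer restarted: forget old tag */
    return V_UPDATE;
}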
@@ -447,46 +451,76 @@ static int sctp_packet(struct nf_conn *ct,
/* Invalid */
if (new_state == SCTP_CONNTRACK_MAX) {
- pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u "
- "conntrack=%u\n",
- dir, sch->type, old_state);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "Invalid, old_state %d, dir %d, type %d",
+ old_state, dir, sch->type);
+
goto out_unlock;
}
/* If it is an INIT or an INIT ACK note down the vtag */
- if (sch->type == SCTP_CID_INIT ||
- sch->type == SCTP_CID_INIT_ACK) {
- struct sctp_inithdr _inithdr, *ih;
+ if (sch->type == SCTP_CID_INIT) {
+ struct sctp_inithdr _ih, *ih;
- ih = skb_header_pointer(skb, offset + sizeof(_sch),
- sizeof(_inithdr), &_inithdr);
- if (ih == NULL)
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
goto out_unlock;
- pr_debug("Setting vtag %x for dir %d\n",
- ih->init_tag, !dir);
+
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir])
+ ct->proto.sctp.init[!dir] = 0;
+ ct->proto.sctp.init[dir] = 1;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
+ ct->proto.sctp.vtag[!dir] = ih->init_tag;
+
+ /* don't renew timeout on init retransmit so
+ * port reuse by client or NAT middlebox cannot
+ * keep entry alive indefinitely (incl. nat info).
+ */
+ if (new_state == SCTP_CONNTRACK_CLOSED &&
+ old_state == SCTP_CONNTRACK_CLOSED &&
+ nf_ct_is_confirmed(ct))
+ ignore = true;
+ } else if (sch->type == SCTP_CID_INIT_ACK) {
+ struct sctp_inithdr _ih, *ih;
+ __be32 vtag;
+
+ ih = skb_header_pointer(skb, offset + sizeof(_sch), sizeof(*ih), &_ih);
+ if (!ih)
+ goto out_unlock;
+
+ vtag = ct->proto.sctp.vtag[!dir];
+ if (!ct->proto.sctp.init[!dir] && vtag && vtag != ih->init_tag)
+ goto out_unlock;
+ /* collision */
+ if (ct->proto.sctp.init[dir] && ct->proto.sctp.init[!dir] &&
+ vtag != ih->init_tag)
+ goto out_unlock;
+
+ pr_debug("Setting vtag %x for dir %d\n", ih->init_tag, !dir);
ct->proto.sctp.vtag[!dir] = ih->init_tag;
}
ct->proto.sctp.state = new_state;
- if (old_state != new_state)
+ if (old_state != new_state) {
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ if (new_state == SCTP_CONNTRACK_ESTABLISHED &&
+ !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_ASSURED, ct);
+ }
}
spin_unlock_bh(&ct->lock);
+ /* allow but do not refresh timeout */
+ if (ignore)
+ return NF_ACCEPT;
+
timeouts = nf_ct_timeout_lookup(ct);
if (!timeouts)
timeouts = nf_sctp_pernet(nf_ct_net(ct))->timeouts;
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
- if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
- dir == IP_CT_DIR_REPLY &&
- new_state == SCTP_CONNTRACK_ESTABLISHED) {
- pr_debug("Setting assured bit\n");
- set_bit(IPS_ASSURED_BIT, &ct->status);
- nf_conntrack_event_cache(IPCT_ASSURED, ct);
- }
-
return NF_ACCEPT;
out_unlock:
@@ -515,24 +549,29 @@ static bool sctp_can_early_drop(const struct nf_conn *ct)
#include <linux/netfilter/nfnetlink_conntrack.h>
static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
spin_lock_bh(&ct->lock);
- nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP);
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state) ||
- nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_ORIGINAL,
ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]) ||
nla_put_be32(skb, CTA_PROTOINFO_SCTP_VTAG_REPLY,
ct->proto.sctp.vtag[IP_CT_DIR_REPLY]))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
@@ -563,8 +602,8 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct)
if (!attr)
return 0;
- err = nla_parse_nested(tb, CTA_PROTOINFO_SCTP_MAX, attr,
- sctp_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_SCTP_MAX, attr,
+ sctp_nla_policy, NULL);
if (err < 0)
return err;
@@ -597,6 +636,9 @@ static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct nf_sctp_net *sn = nf_sctp_pernet(net);
int i;
+ if (!timeouts)
+ timeouts = sn->timeouts;
+
/* set default SCTP timeouts. */
for (i=0; i<SCTP_CONNTRACK_MAX; i++)
timeouts[i] = sn->timeouts[i];
@@ -642,116 +684,18 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-
-#ifdef CONFIG_SYSCTL
-static struct ctl_table sctp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_sctp_timeout_closed",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_cookie_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_cookie_echoed",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_established",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_shutdown_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_shutdown_recd",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_heartbeat_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_sctp_timeout_heartbeat_acked",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif
-
-static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_sctp_net *sn)
-{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table)
- return 0;
-
- pn->ctl_table = kmemdup(sctp_sysctl_table,
- sizeof(sctp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED];
- pn->ctl_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT];
- pn->ctl_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED];
- pn->ctl_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED];
- pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT];
- pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD];
- pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT];
- pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT];
- pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED];
-#endif
- return 0;
-}
-
-static int sctp_init_net(struct net *net)
+void nf_conntrack_sctp_init_net(struct net *net)
{
struct nf_sctp_net *sn = nf_sctp_pernet(net);
- struct nf_proto_net *pn = &sn->pn;
-
- if (!pn->users) {
- int i;
-
- for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
- sn->timeouts[i] = sctp_timeouts[i];
-
- /* timeouts[0] is unused, init it so ->timeouts[0] contains
- * 'new' timeout, like udp or icmp.
- */
- sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED];
- }
+ int i;
- return sctp_kmemdup_sysctl_table(pn, sn);
-}
+ for (i = 0; i < SCTP_CONNTRACK_MAX; i++)
+ sn->timeouts[i] = sctp_timeouts[i];
-static struct nf_proto_net *sctp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.sctp.pn;
+ /* timeouts[0] is unused, init it so ->timeouts[0] contains
+ * 'new' timeout, like udp or icmp.
+ */
+ sn->timeouts[0] = sctp_timeouts[SCTP_CONNTRACK_CLOSED];
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
@@ -759,9 +703,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = sctp_print_conntrack,
#endif
- .packet = sctp_packet,
.can_early_drop = sctp_can_early_drop,
- .me = THIS_MODULE,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_size = SCTP_NLATTR_SIZE,
.to_nlattr = sctp_to_nlattr,
@@ -780,6 +722,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = {
.nla_policy = sctp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = sctp_init_net,
- .get_net_proto = sctp_get_net_proto,
};
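The Sec 8.5.1 rework earlier in this file keys the expected vtag for ABORT and SHUTDOWN_COMPLETE off the chunk's T flag: with T clear the packet must carry this direction's own tag, with T set the peer's reflected tag. A one-function sketch of rules (B)/(C), field names invented for the example:

#include <stdbool.h>
#include <stdint.h>

#define CHUNK_FLAG_T 0x01

/* vtag[dir] is the tag this sender normally uses, vtag[!dir] the peer's */
static bool abort_vtag_ok(const uint32_t vtag[2], int dir,
                          uint8_t chunk_flags, uint32_t pkt_vtag)
{
    if (chunk_flags & CHUNK_FLAG_T)
        return pkt_vtag == vtag[!dir];   /* T set: reflected tag */
    return pkt_vtag == vtag[dir];        /* T clear: own tag */
}

Note that the old code accepted either tag regardless of T; the hunk tightens this to the exact pairing the RFC requires.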
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 4dcbd51a8e97..0c1d086e96cb 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -17,7 +14,7 @@
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
@@ -34,20 +31,6 @@
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-/* "Be conservative in what you do,
- be liberal in what you accept from others."
- If it's non-zero, we mark only out of window RST segments as INVALID. */
-static int nf_ct_tcp_be_liberal __read_mostly = 0;
-
-/* If it is set to zero, we disable picking up already established
- connections. */
-static int nf_ct_tcp_loose __read_mostly = 1;
-
-/* Max number of the retransmitted packets without receiving an (acceptable)
- ACK from the destination. If this number is reached, a shorter timer
- will be started. */
-static int nf_ct_tcp_max_retrans __read_mostly = 3;
-
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR */
@@ -64,6 +47,12 @@ static const char *const tcp_conntrack_names[] = {
"SYN_SENT2",
};
+enum nf_ct_tcp_action {
+ NFCT_TCP_IGNORE,
+ NFCT_TCP_INVALID,
+ NFCT_TCP_ACCEPT,
+};
+
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
@@ -355,10 +344,11 @@ static void tcp_options(const struct sk_buff *skb,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
- state->td_scale =
- state->flags = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
while (length > 0) {
int opcode=*ptr++;
@@ -411,7 +401,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
length, buff);
- BUG_ON(ptr == NULL);
+ if (!ptr)
+ return;
/* Fast path for timestamp-only option */
if (length == TCPOLEN_TSTAMP_ALIGNED
@@ -461,29 +452,81 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
}
}
-static bool tcp_in_window(const struct nf_conn *ct,
- struct ip_ct_tcp *state,
- enum ip_conntrack_dir dir,
- unsigned int index,
- const struct sk_buff *skb,
- unsigned int dataoff,
- const struct tcphdr *tcph)
+static void tcp_init_sender(struct ip_ct_tcp_state *sender,
+ struct ip_ct_tcp_state *receiver,
+ const struct sk_buff *skb,
+ unsigned int dataoff,
+ const struct tcphdr *tcph,
+ u32 end, u32 win,
+ enum ip_conntrack_dir dir)
{
- struct net *net = nf_ct_net(ct);
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ /* SYN-ACK in reply to a SYN
+ * or SYN from reply direction in simultaneous open.
+ */
+ sender->td_end =
+ sender->td_maxend = end;
+ sender->td_maxwin = (win == 0 ? 1 : win);
+
+ tcp_options(skb, dataoff, tcph, sender);
+ /* RFC 1323:
+ * Both sides must send the Window Scale option
+ * to enable window scaling in either direction.
+ */
+ if (dir == IP_CT_DIR_REPLY &&
+ !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
+ receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) {
+ sender->td_scale = 0;
+ receiver->td_scale = 0;
+ }
+}
+
+__printf(6, 7)
+static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb,
+ const struct nf_conn *ct,
+ const struct nf_hook_state *state,
+ const struct ip_ct_tcp_state *sender,
+ enum nf_ct_tcp_action ret,
+ const char *fmt, ...)
+{
+ const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct));
+ struct va_format vaf;
+ va_list args;
+ bool be_liberal;
+
+ be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal;
+ if (be_liberal)
+ return NFCT_TCP_ACCEPT;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf);
+ va_end(args);
+
+ return ret;
+}
+
+static enum nf_ct_tcp_action
+tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir,
+ unsigned int index, const struct sk_buff *skb,
+ unsigned int dataoff, const struct tcphdr *tcph,
+ const struct nf_hook_state *hook_state)
+{
+ struct ip_ct_tcp *state = &ct->proto.tcp;
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
- const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
__u32 seq, ack, sack, end, win, swin;
+ bool in_recv_win, seq_ok;
s32 receiver_offset;
- bool res, in_recv_win;
+ u16 win_raw;
/*
* Get the required data from the packet.
*/
seq = ntohl(tcph->seq);
ack = sack = ntohl(tcph->ack_seq);
- win = ntohs(tcph->window);
+ win_raw = ntohs(tcph->window);
+ win = win_raw;
end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
@@ -494,44 +537,17 @@ static bool tcp_in_window(const struct nf_conn *ct,
ack -= receiver_offset;
sack -= receiver_offset;
- pr_debug("tcp_in_window: START\n");
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
-
if (sender->td_maxwin == 0) {
/*
* Initialize sender data.
*/
if (tcph->syn) {
- /*
- * SYN-ACK in reply to a SYN
- * or SYN from reply direction in simultaneous open.
- */
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
-
- tcp_options(skb, dataoff, tcph, sender);
- /*
- * RFC 1323:
- * Both sides must send the Window Scale option
- * to enable window scaling in either direction.
- */
- if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
- && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
- sender->td_scale =
- receiver->td_scale = 0;
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win, dir);
if (!tcph->ack)
/* Simultaneous open */
- return true;
+ return NFCT_TCP_ACCEPT;
} else {
/*
* We are in the middle of a connection,
@@ -542,29 +558,39 @@ static bool tcp_in_window(const struct nf_conn *ct,
swin = win << sender->td_scale;
sender->td_maxwin = (swin == 0 ? 1 : swin);
sender->td_maxend = end + sender->td_maxwin;
- /*
- * We haven't seen traffic in the other direction yet
- * but we have to tweak window tracking to pass III
- * and IV until that happens.
- */
- if (receiver->td_maxwin == 0)
+ if (receiver->td_maxwin == 0) {
+ /* We haven't seen traffic in the other
+ * direction yet but we have to tweak window
+ * tracking to pass III and IV until that
+ * happens.
+ */
receiver->td_end = receiver->td_maxend = sack;
+ } else if (sack == receiver->td_end + 1) {
+ /* Likely a reply to a keepalive.
+ * Needed for III.
+ */
+ receiver->td_end++;
+ }
+
}
- } else if (((state->state == TCP_CONNTRACK_SYN_SENT
- && dir == IP_CT_DIR_ORIGINAL)
- || (state->state == TCP_CONNTRACK_SYN_RECV
- && dir == IP_CT_DIR_REPLY))
- && after(end, sender->td_end)) {
+ } else if (tcph->syn &&
+ after(end, sender->td_end) &&
+ (state->state == TCP_CONNTRACK_SYN_SENT ||
+ state->state == TCP_CONNTRACK_SYN_RECV)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
* not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
+ *
+ * Re-init state for this direction, just like for the first
+ * syn(-ack) reply, it might differ in seq, ack or tcp options.
*/
- sender->td_end =
- sender->td_maxend = end;
- sender->td_maxwin = (win == 0 ? 1 : win);
+ tcp_init_sender(sender, receiver,
+ skb, dataoff, tcph,
+ end, win, dir);
- tcp_options(skb, dataoff, tcph, sender);
+ if (dir == IP_CT_DIR_REPLY && !tcph->ack)
+ return NFCT_TCP_ACCEPT;
}
if (!(tcph->ack)) {
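tcp_init_sender(), factored out above, seeds a direction's window state from a SYN or SYN-ACK and then enforces the RFC 1323 rule that window scaling only takes effect if both sides offered the option. A standalone sketch of that negotiation, assuming the TCP options were already parsed into flags/scale (struct peer and the parameter names are illustrative):

#include <stdint.h>

#define FLAG_WSCALE 0x1
#define DIR_REPLY   1

struct peer { uint32_t td_end, td_maxend, td_maxwin; uint8_t td_scale, flags; };

static void init_sender(struct peer *snd, struct peer *rcv,
                        uint32_t end, uint32_t win, int dir,
                        uint8_t opt_flags, uint8_t opt_scale)
{
    snd->td_end = snd->td_maxend = end;
    snd->td_maxwin = win ? win : 1;      /* a zero window still opens later */
    snd->flags = opt_flags;              /* as parsed by tcp_options() */
    snd->td_scale = opt_scale;

    /* RFC 1323: both sides must send the option, or neither scales */
    if (dir == DIR_REPLY &&
        !((snd->flags & FLAG_WSCALE) && (rcv->flags & FLAG_WSCALE))) {
        snd->td_scale = 0;
        rcv->td_scale = 0;
    }
}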
@@ -588,113 +614,166 @@ static bool tcp_in_window(const struct nf_conn *ct,
*/
seq = end = sender->td_end;
- pr_debug("tcp_in_window: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
- seq, ack, receiver_offset, sack, receiver_offset, win, end);
- pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
+ seq_ok = before(seq, sender->td_maxend + 1);
+ if (!seq_ok) {
+ u32 overshot = end - sender->td_maxend + 1;
+ bool ack_ok;
+
+ ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1);
+ in_recv_win = receiver->td_maxwin &&
+ after(end, sender->td_end - receiver->td_maxwin - 1);
+
+ if (in_recv_win &&
+ ack_ok &&
+ overshot <= receiver->td_maxwin &&
+ before(sack, receiver->td_end + 1)) {
+ /* Work around TCPs that send more bytes than allowed by
+ * the receive window.
+ *
+			 * If the (marked as invalid) packet is allowed to pass by
+			 * the ruleset and the peer acks this data, then it's possible
+			 * all future packets will trigger the 'ACK is over upper bound' check.
+			 *
+			 * Thus, if only the sequence check fails, do update td_end so
+			 * a possible ACK for this data can update internal state.
+ */
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "%u bytes more than expected", overshot);
+ }
+
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "SEQ is over upper bound %u (over the window of the receiver)",
+ sender->td_maxend + 1);
+ }
+
+ if (!before(sack, receiver->td_end + 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID,
+ "ACK is over upper bound %u (ACKed data not seen yet)",
+ receiver->td_end + 1);
/* Is the ending sequence in the receive window (if available)? */
in_recv_win = !receiver->td_maxwin ||
after(end, sender->td_end - receiver->td_maxwin - 1);
+ if (!in_recv_win)
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "SEQ is under lower bound %u (already ACKed data retransmitted)",
+ sender->td_end - receiver->td_maxwin - 1);
+ if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1))
+ return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE,
+ "ignored ACK under lower bound %u (possible overly delayed)",
+ receiver->td_end - MAXACKWINDOW(sender) - 1);
+
+ /* Take into account window scaling (RFC 1323). */
+ if (!tcph->syn)
+ win <<= sender->td_scale;
+
+ /* Update sender data. */
+ swin = win + (sack - ack);
+ if (sender->td_maxwin < swin)
+ sender->td_maxwin = swin;
+ if (after(end, sender->td_end)) {
+ sender->td_end = end;
+ sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ }
+ if (tcph->ack) {
+ if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+ sender->td_maxack = ack;
+ sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+ } else if (after(ack, sender->td_maxack)) {
+ sender->td_maxack = ack;
+ }
+ }
- pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
- before(seq, sender->td_maxend + 1),
- (in_recv_win ? 1 : 0),
- before(sack, receiver->td_end + 1),
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
+ /* Update receiver data. */
+ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
+ receiver->td_maxwin += end - sender->td_maxend;
+ if (after(sack + win, receiver->td_maxend - 1)) {
+ receiver->td_maxend = sack + win;
+ if (win == 0)
+ receiver->td_maxend++;
+ }
+ if (ack == receiver->td_end)
+ receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+
+ /* Check retransmissions. */
+ if (index == TCP_ACK_SET) {
+ if (state->last_dir == dir &&
+ state->last_seq == seq &&
+ state->last_ack == ack &&
+ state->last_end == end &&
+ state->last_win == win_raw) {
+ state->retrans++;
+ } else {
+ state->last_dir = dir;
+ state->last_seq = seq;
+ state->last_ack = ack;
+ state->last_end = end;
+ state->last_win = win_raw;
+ state->retrans = 0;
+ }
+ }
- if (before(seq, sender->td_maxend + 1) &&
- in_recv_win &&
- before(sack, receiver->td_end + 1) &&
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
- /*
- * Take into account window scaling (RFC 1323).
- */
- if (!tcph->syn)
- win <<= sender->td_scale;
+ return NFCT_TCP_ACCEPT;
+}
- /*
- * Update sender data.
- */
- swin = win + (sack - ack);
- if (sender->td_maxwin < swin)
- sender->td_maxwin = swin;
- if (after(end, sender->td_end)) {
- sender->td_end = end;
- sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
- }
- if (tcph->ack) {
- if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
- sender->td_maxack = ack;
- sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
- } else if (after(ack, sender->td_maxack))
- sender->td_maxack = ack;
- }
+static void __cold nf_tcp_handle_invalid(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ int index,
+ const struct sk_buff *skb,
+ const struct nf_hook_state *hook_state)
+{
+ const unsigned int *timeouts;
+ const struct nf_tcp_net *tn;
+ unsigned int timeout;
+ u32 expires;
- /*
- * Update receiver data.
- */
- if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
- receiver->td_maxwin += end - sender->td_maxend;
- if (after(sack + win, receiver->td_maxend - 1)) {
- receiver->td_maxend = sack + win;
- if (win == 0)
- receiver->td_maxend++;
- }
- if (ack == receiver->td_end)
- receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
+ if (!test_bit(IPS_ASSURED_BIT, &ct->status) ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
+ return;
- /*
- * Check retransmissions.
- */
- if (index == TCP_ACK_SET) {
- if (state->last_dir == dir
- && state->last_seq == seq
- && state->last_ack == ack
- && state->last_end == end
- && state->last_win == win)
- state->retrans++;
- else {
- state->last_dir = dir;
- state->last_seq = seq;
- state->last_ack = ack;
- state->last_end = end;
- state->last_win = win;
- state->retrans = 0;
- }
- }
- res = true;
- } else {
- res = false;
- if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
- tn->tcp_be_liberal)
- res = true;
- if (!res) {
- nf_ct_l4proto_log_invalid(skb, ct,
- "%s",
- before(seq, sender->td_maxend + 1) ?
- in_recv_win ?
- before(sack, receiver->td_end + 1) ?
- after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
- : "ACK is under the lower bound (possible overly delayed ACK)"
- : "ACK is over the upper bound (ACKed data not seen yet)"
- : "SEQ is under the lower bound (already ACKed data retransmitted)"
- : "SEQ is over the upper bound (over the window of the receiver)");
- }
+	/* We don't want connections hanging around in ESTABLISHED
+	 * state for a long time 'just because' conntrack deemed a
+	 * FIN/RST out-of-window.
+ *
+ * Shrink the timeout just like when there is unacked data.
+ * This speeds up eviction of 'dead' connections where the
+ * connection and conntracks internal state are out of sync.
+ */
+ switch (index) {
+ case TCP_RST_SET:
+ case TCP_FIN_SET:
+ break;
+ default:
+ return;
}
- pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
- "receiver end=%u maxend=%u maxwin=%u\n",
- res, sender->td_end, sender->td_maxend, sender->td_maxwin,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+ if (ct->proto.tcp.last_dir != dir &&
+ (ct->proto.tcp.last_index == TCP_FIN_SET ||
+ ct->proto.tcp.last_index == TCP_RST_SET)) {
+ expires = nf_ct_expires(ct);
+ if (expires < 120 * HZ)
+ return;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
- return res;
+ timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]);
+ if (expires > timeout) {
+ nf_ct_l4proto_log_invalid(skb, ct, hook_state,
+ "packet (index %d, dir %d) response for index %d lower timeout to %u",
+ index, dir, ct->proto.tcp.last_index, timeout);
+
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+ }
+ } else {
+ ct->proto.tcp.last_index = index;
+ ct->proto.tcp.last_dir = dir;
+ }
}
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
@@ -716,7 +795,7 @@ static void tcp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg);
}
/* Protect conntrack against broken packets. Code taken from ipt_unclean.c. */
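The tcp_in_window() rewrite in the hunks above unrolls the old four-way boolean into early returns, one log message per failed bound. A wraparound-safe userspace sketch of the four checks in the order the new code applies them; struct peer, max_ack_win and the plain true/false result are simplifications (the kernel distinguishes IGNORE from INVALID per check and still honors be_liberal):

#include <stdbool.h>
#include <stdint.h>

/* wraparound-safe comparisons, as in the kernel's before()/after() */
static bool seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool seq_after(uint32_t a, uint32_t b)  { return (int32_t)(b - a) < 0; }

struct peer { uint32_t td_end, td_maxend, td_maxwin, max_ack_win; };

static bool in_window(const struct peer *snd, const struct peer *rcv,
                      uint32_t seq, uint32_t end, uint32_t sack)
{
    /* I: SEQ not over the receiver's advertised upper bound */
    if (!seq_before(seq, snd->td_maxend + 1))
        return false;
    /* III: ACK does not claim data the receiver has never sent */
    if (!seq_before(sack, rcv->td_end + 1))
        return false;
    /* II: segment end not below already-ACKed data */
    if (rcv->td_maxwin &&
        !seq_after(end, snd->td_end - rcv->td_maxwin - 1))
        return false;
    /* IV: ACK not absurdly delayed */
    if (!seq_after(sack, rcv->td_end - snd->max_ack_win - 1))
        return false;
    return true;
}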
@@ -758,20 +837,19 @@ static bool tcp_error(const struct tcphdr *th,
static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff,
- const struct tcphdr *th)
+ const struct tcphdr *th,
+ const struct nf_hook_state *state)
{
enum tcp_conntrack new_state;
struct net *net = nf_ct_net(ct);
const struct nf_tcp_net *tn = nf_tcp_pernet(net);
- const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
- const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
/* Don't need lock here: this conntrack not in circulation yet */
new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
- pr_debug("nf_ct_tcp: invalid new deleting.\n");
+ tcp_error_log(skb, state, "invalid new");
return false;
}
@@ -817,29 +895,82 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* tcp_packet will set them */
ct->proto.tcp.last_index = TCP_NONE_SET;
-
- pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i "
- "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
- __func__,
- sender->td_end, sender->td_maxend, sender->td_maxwin,
- sender->td_scale,
- receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
- receiver->td_scale);
return true;
}
+static bool tcp_can_early_drop(const struct nf_conn *ct)
+{
+ switch (ct->proto.tcp.state) {
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ case TCP_CONNTRACK_CLOSE:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+void nf_conntrack_tcp_set_closing(struct nf_conn *ct)
+{
+ enum tcp_conntrack old_state;
+ const unsigned int *timeouts;
+ u32 timeout;
+
+ if (!nf_ct_is_confirmed(ct))
+ return;
+
+ spin_lock_bh(&ct->lock);
+ old_state = ct->proto.tcp.state;
+ ct->proto.tcp.state = TCP_CONNTRACK_CLOSE;
+
+ if (old_state == TCP_CONNTRACK_CLOSE ||
+ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
+ spin_unlock_bh(&ct->lock);
+ return;
+ }
+
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts) {
+ const struct nf_tcp_net *tn;
+
+ tn = nf_tcp_pernet(nf_ct_net(ct));
+ timeouts = tn->timeouts;
+ }
+
+ timeout = timeouts[TCP_CONNTRACK_CLOSE];
+ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
+
+ spin_unlock_bh(&ct->lock);
+
+ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+}
+
+static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
+{
+ state->td_end = 0;
+ state->td_maxend = 0;
+ state->td_maxwin = 0;
+ state->td_maxack = 0;
+ state->td_scale = 0;
+ state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
+}
+
/* Returns verdict for packet, or -1 for invalid. */
-static int tcp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_tcp_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = nf_tcp_pernet(net);
- struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
+ enum nf_ct_tcp_action res;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
@@ -852,7 +983,7 @@ static int tcp_packet(struct nf_conn *ct,
if (tcp_error(th, skb, dataoff, state))
return -NF_ACCEPT;
- if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th))
+ if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state))
return -NF_ACCEPT;
spin_lock_bh(&ct->lock);
@@ -860,7 +991,6 @@ static int tcp_packet(struct nf_conn *ct,
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
new_state = tcp_conntracks[dir][index][old_state];
- tuple = &ct->tuplehash[dir].tuple;
switch (new_state) {
case TCP_CONNTRACK_SYN_SENT:
@@ -895,7 +1025,7 @@ static int tcp_packet(struct nf_conn *ct,
return -NF_REPEAT;
return NF_DROP;
}
- /* Fall through */
+ fallthrough;
case TCP_CONNTRACK_IGNORE:
/* Ignored packets:
*
@@ -934,8 +1064,7 @@ static int tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
ct->proto.tcp.last_flags;
- memset(&ct->proto.tcp.seen[dir], 0,
- sizeof(struct ip_ct_tcp_state));
+ nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
break;
}
ct->proto.tcp.last_index = index;
@@ -975,9 +1104,18 @@ static int tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_flags |=
IP_CT_EXP_CHALLENGE_ACK;
}
+
+ /* possible challenge ack reply to syn */
+ if (old_state == TCP_CONNTRACK_SYN_SENT &&
+ index == TCP_ACK_SET &&
+ dir == IP_CT_DIR_REPLY)
+ ct->proto.tcp.last_ack = ntohl(th->ack_seq);
+
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in "
- "state %s ", tcp_conntrack_names[old_state]);
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d ignored, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
/* Special case for SYN proxy: when the SYN to the server or
@@ -996,10 +1134,11 @@ static int tcp_packet(struct nf_conn *ct,
}
/* Invalid packet */
- pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
- dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
+ nf_ct_l4proto_log_invalid(skb, ct, state,
+ "packet (index %d) in dir %d invalid, state %s",
+ index, dir,
+ tcp_conntrack_names[old_state]);
return -NF_ACCEPT;
case TCP_CONNTRACK_TIME_WAIT:
		/* RFC5961 compliance causes the stack to send a "challenge-ACK"
@@ -1014,7 +1153,7 @@ static int tcp_packet(struct nf_conn *ct,
/* Detected RFC5961 challenge ACK */
ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
+ nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored");
return NF_ACCEPT; /* Don't change state */
}
break;
@@ -1030,16 +1169,58 @@ static int tcp_packet(struct nf_conn *ct,
new_state = TCP_CONNTRACK_ESTABLISHED;
break;
case TCP_CONNTRACK_CLOSE:
- if (index == TCP_RST_SET
- && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
- && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
- /* Invalid RST */
- spin_unlock_bh(&ct->lock);
- nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
- return -NF_ACCEPT;
+ if (index != TCP_RST_SET)
+ break;
+
+ /* If we are closing, the tuple might have been re-used already.
+ * last_index, last_ack, and all other ct fields used for
+ * sequence/window validation are outdated in that case.
+ *
+ * As the conntrack can already be expired by GC under pressure,
+ * just skip validation checks.
+ */
+ if (tcp_can_early_drop(ct))
+ goto in_window;
+
+ /* td_maxack might be outdated if we let a SYN through earlier */
+ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+ ct->proto.tcp.last_index != TCP_SYN_SET) {
+ u32 seq = ntohl(th->seq);
+
+ /* If we are not in established state and SEQ=0 this is most
+ * likely an answer to a SYN we let go through above (last_index
+ * can be updated due to out-of-order ACKs).
+ */
+ if (seq == 0 && !nf_conntrack_tcp_established(ct))
+ break;
+
+ if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+ !tn->tcp_ignore_invalid_rst) {
+ /* Invalid RST */
+ spin_unlock_bh(&ct->lock);
+ nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
+ return -NF_ACCEPT;
+ }
+
+ if (!nf_conntrack_tcp_established(ct) ||
+ seq == ct->proto.tcp.seen[!dir].td_maxack)
+ break;
+
+ /* Check if the RST is part of a train, such as
+ * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42
+ * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42
+ */
+ if (ct->proto.tcp.last_index == TCP_ACK_SET &&
+ ct->proto.tcp.last_dir == dir &&
+ seq == ct->proto.tcp.last_end)
+ break;
+
+ /* ... RST sequence number doesn't match exactly, keep
+ * established state to allow a possible challenge ACK.
+ */
+ new_state = old_state;
}
- if (index == TCP_RST_SET
- && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
+ if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
&& ct->proto.tcp.last_index == TCP_SYN_SET)
|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& ct->proto.tcp.last_index == TCP_ACK_SET))
@@ -1055,29 +1236,38 @@ static int tcp_packet(struct nf_conn *ct,
* segments we ignored. */
goto in_window;
}
- /* Just fall through */
+
+ /* Reset in response to a challenge-ack we let through earlier */
+ if (old_state == TCP_CONNTRACK_SYN_SENT &&
+ ct->proto.tcp.last_index == TCP_ACK_SET &&
+ ct->proto.tcp.last_dir == IP_CT_DIR_REPLY &&
+ ntohl(th->seq) == ct->proto.tcp.last_ack)
+ goto in_window;
+
+ break;
default:
/* Keep compilers happy. */
break;
}
- if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
- skb, dataoff, th)) {
+ res = tcp_in_window(ct, dir, index,
+ skb, dataoff, th, state);
+ switch (res) {
+ case NFCT_TCP_IGNORE:
+ spin_unlock_bh(&ct->lock);
+ return NF_ACCEPT;
+ case NFCT_TCP_INVALID:
+ nf_tcp_handle_invalid(ct, dir, index, skb, state);
spin_unlock_bh(&ct->lock);
return -NF_ACCEPT;
+ case NFCT_TCP_ACCEPT:
+ break;
}
in_window:
/* From now on we have got in-window packets */
ct->proto.tcp.last_index = index;
ct->proto.tcp.last_dir = dir;
- pr_debug("tcp_conntracks: ");
- nf_ct_dump_tuple(tuple);
- pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
- (th->syn ? 1 : 0), (th->ack ? 1 : 0),
- (th->fin ? 1 : 0), (th->rst ? 1 : 0),
- old_state, new_state);
-
ct->proto.tcp.state = new_state;
if (old_state != new_state
&& new_state == TCP_CONNTRACK_FIN_WAIT)
@@ -1090,6 +1280,8 @@ static int tcp_packet(struct nf_conn *ct,
if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
timeout = timeouts[TCP_CONNTRACK_RETRANS];
+ else if (unlikely(index == TCP_RST_SET))
+ timeout = timeouts[TCP_CONNTRACK_CLOSE];
else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
@@ -1113,6 +1305,16 @@ static int tcp_packet(struct nf_conn *ct,
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
+
+ if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+ /* Do not renew the timeout on SYN retransmit.
+ *
+ * Otherwise port reuse by a client or a NAT middlebox can keep
+ * the entry alive indefinitely (including NAT info).
+ */
+ return NF_ACCEPT;
+ }
+
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
@@ -1123,7 +1325,7 @@ static int tcp_packet(struct nf_conn *ct,
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
- /* Set ASSURED if we see see valid ack in ESTABLISHED
+ /* Set ASSURED if we see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
set_bit(IPS_ASSURED_BIT, &ct->status);
@@ -1134,40 +1336,29 @@ static int tcp_packet(struct nf_conn *ct,
return NF_ACCEPT;
}
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
- switch (ct->proto.tcp.state) {
- case TCP_CONNTRACK_FIN_WAIT:
- case TCP_CONNTRACK_LAST_ACK:
- case TCP_CONNTRACK_TIME_WAIT:
- case TCP_CONNTRACK_CLOSE:
- case TCP_CONNTRACK_CLOSE_WAIT:
- return true;
- default:
- break;
- }
-
- return false;
-}
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- struct nf_conn *ct)
+ struct nf_conn *ct, bool destroy)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
spin_lock_bh(&ct->lock);
- nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP);
if (!nest_parms)
goto nla_put_failure;
- if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
- nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state))
+ goto nla_put_failure;
+
+ if (destroy)
+ goto skip_state;
+
+ if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
ct->proto.tcp.seen[0].td_scale) ||
nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
ct->proto.tcp.seen[1].td_scale))
@@ -1182,8 +1373,8 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp))
goto nla_put_failure;
+skip_state:
spin_unlock_bh(&ct->lock);
-
nla_nest_end(skb, nest_parms);
return 0;
@@ -1198,7 +1389,7 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
[CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 },
[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) },
- [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
+ [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
};
#define TCP_NLATTR_SIZE ( \
@@ -1218,8 +1409,8 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
if (!pattr)
return 0;
- err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr,
- tcp_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr,
+ tcp_nla_policy, NULL);
if (err < 0)
return err;
@@ -1387,146 +1578,42 @@ static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table tcp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_tcp_timeout_syn_sent",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_syn_recv",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_established",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_fin_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_close_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_last_ack",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_time_wait",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_close",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_max_retrans",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_timeout_unacknowledged",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_tcp_loose",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nf_conntrack_tcp_be_liberal",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nf_conntrack_tcp_max_retrans",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_tcp_net *tn)
+void nf_conntrack_tcp_init_net(struct net *net)
{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table)
- return 0;
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ int i;
- pn->ctl_table = kmemdup(tcp_sysctl_table,
- sizeof(tcp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
-
- pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
- pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
- pn->ctl_table[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
- pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
- pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
- pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
- pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
- pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
- pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
- pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK];
- pn->ctl_table[10].data = &tn->tcp_loose;
- pn->ctl_table[11].data = &tn->tcp_be_liberal;
- pn->ctl_table[12].data = &tn->tcp_max_retrans;
-#endif
- return 0;
-}
+ for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
+ tn->timeouts[i] = tcp_timeouts[i];
-static int tcp_init_net(struct net *net)
-{
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
- struct nf_proto_net *pn = &tn->pn;
+ /* timeouts[0] is unused, make it the same as SYN_SENT so
+ * ->timeouts[0] contains the 'new' timeout, like udp or icmp.
+ */
+ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
- if (!pn->users) {
- int i;
+ /* If it is set to zero, we disable picking up already established
+ * connections.
+ */
+ tn->tcp_loose = 1;
- for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
- tn->timeouts[i] = tcp_timeouts[i];
+ /* "Be conservative in what you do,
+ * be liberal in what you accept from others."
+ * If it's non-zero, we mark only out of window RST segments as INVALID.
+ */
+ tn->tcp_be_liberal = 0;
- /* timeouts[0] is unused, make it same as SYN_SENT so
- * ->timeouts[0] contains 'new' timeout, like udp or icmp.
- */
- tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
- tn->tcp_loose = nf_ct_tcp_loose;
- tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
- tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
- }
+ /* If it's non-zero, we turn off the RST sequence number check */
+ tn->tcp_ignore_invalid_rst = 0;
- return tcp_kmemdup_sysctl_table(pn, tn);
-}
+ /* Max number of retransmitted packets without receiving an (acceptable)
+ * ACK from the destination. If this number is reached, a shorter timer
+ * is started.
+ */
+ tn->tcp_max_retrans = 3;
-static struct nf_proto_net *tcp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.tcp.pn;
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ tn->offload_timeout = 30 * HZ;
+#endif
}
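
With timeouts[0] aliasing SYN_SENT, code that only distinguishes "new" flows can index the array uniformly, as the UDP and ICMP trackers already do. A minimal sketch (hypothetical helper, not part of the patch):

	/* ->timeouts[0] is the 'new' timeout for an unconfirmed flow. */
	static unsigned int tcp_new_timeout(const struct nf_tcp_net *tn)
	{
		return tn->timeouts[0];	/* == timeouts[TCP_CONNTRACK_SYN_SENT] */
	}
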
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
@@ -1535,7 +1622,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = tcp_print_conntrack,
#endif
- .packet = tcp_packet,
.can_early_drop = tcp_can_early_drop,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.to_nlattr = tcp_to_nlattr,
@@ -1556,6 +1642,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
.nla_policy = tcp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = tcp_init_net,
- .get_net_proto = tcp_get_net_proto,
};
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index b4f5d5e82031..0030fbe8885c 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -41,8 +38,7 @@ static void udp_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDP, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDP, "%s", msg);
}
static bool udp_error(struct sk_buff *skb,
@@ -85,13 +81,14 @@ static bool udp_error(struct sk_buff *skb,
}
 /* Returns verdict for packet, and may modify conntrack type */
-static int udp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_udp_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
unsigned int *timeouts;
+ unsigned long status;
if (udp_error(skb, dataoff, state))
return -NF_ACCEPT;
@@ -100,27 +97,34 @@ static int udp_packet(struct nf_conn *ct,
if (!timeouts)
timeouts = udp_get_timeouts(nf_ct_net(ct));
- if (!nf_ct_is_confirmed(ct))
+ status = READ_ONCE(ct->status);
+ if ((status & IPS_CONFIRMED) == 0)
ct->proto.udp.stream_ts = 2 * HZ + jiffies;
/* If we've seen traffic both ways, this is some kind of UDP
* stream. Set Assured.
*/
- if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ if (status & IPS_SEEN_REPLY) {
unsigned long extra = timeouts[UDP_CT_UNREPLIED];
+ bool stream = false;
/* Still active after two seconds? Extend timeout. */
- if (time_after(jiffies, ct->proto.udp.stream_ts))
+ if (time_after(jiffies, ct->proto.udp.stream_ts)) {
extra = timeouts[UDP_CT_REPLIED];
+ stream = (status & IPS_ASSURED) == 0;
+ }
nf_ct_refresh_acct(ct, ctinfo, skb, extra);
+ /* never set ASSURED for IPS_NAT_CLASH entries, they time out soon */
+ if (unlikely((status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
- if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
+ if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
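
The stream heuristic above, in isolation: a new entry records a deadline two seconds ahead, and only a flow still exchanging traffic past that deadline is promoted to the longer "stream" timeout and the ASSURED bit, so short probes never pin an entry, and NAT-clash entries are never promoted at all. A condensed sketch, using the field names from the hunk:

	unsigned long extra = timeouts[UDP_CT_UNREPLIED];
	bool stream = false;

	/* stream_ts was set to jiffies + 2*HZ when the entry was created */
	if (time_after(jiffies, ct->proto.udp.stream_ts)) {
		extra = timeouts[UDP_CT_REPLIED];	/* stream timeout */
		stream = (status & IPS_ASSURED) == 0;	/* promote only once */
	}
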
@@ -130,8 +134,7 @@ static void udplite_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state,
const char *msg)
{
- nf_l4proto_log_invalid(skb, state->net, state->pf,
- IPPROTO_UDPLITE, "%s", msg);
+ nf_l4proto_log_invalid(skb, state, IPPROTO_UDPLITE, "%s", msg);
}
static bool udplite_error(struct sk_buff *skb,
@@ -177,11 +180,11 @@ static bool udplite_error(struct sk_buff *skb,
}
 /* Returns verdict for packet, and may modify conntrack type */
-static int udplite_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state)
+int nf_conntrack_udplite_packet(struct nf_conn *ct,
+ struct sk_buff *skb,
+ unsigned int dataoff,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_hook_state *state)
{
unsigned int *timeouts;
@@ -197,12 +200,15 @@ static int udplite_packet(struct nf_conn *ct,
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
nf_ct_refresh_acct(ct, ctinfo, skb,
timeouts[UDP_CT_REPLIED]);
+
+ if (unlikely((ct->status & IPS_NAT_CLASH)))
+ return NF_ACCEPT;
+
/* Also, more likely to be important, and not a probe */
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -260,66 +266,23 @@ udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = {
};
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-#ifdef CONFIG_SYSCTL
-static struct ctl_table udp_sysctl_table[] = {
- {
- .procname = "nf_conntrack_udp_timeout",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "nf_conntrack_udp_timeout_stream",
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- { }
-};
-#endif /* CONFIG_SYSCTL */
-
-static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn,
- struct nf_udp_net *un)
-{
-#ifdef CONFIG_SYSCTL
- if (pn->ctl_table)
- return 0;
- pn->ctl_table = kmemdup(udp_sysctl_table,
- sizeof(udp_sysctl_table),
- GFP_KERNEL);
- if (!pn->ctl_table)
- return -ENOMEM;
- pn->ctl_table[0].data = &un->timeouts[UDP_CT_UNREPLIED];
- pn->ctl_table[1].data = &un->timeouts[UDP_CT_REPLIED];
-#endif
- return 0;
-}
-
-static int udp_init_net(struct net *net)
+void nf_conntrack_udp_init_net(struct net *net)
{
struct nf_udp_net *un = nf_udp_pernet(net);
- struct nf_proto_net *pn = &un->pn;
+ int i;
- if (!pn->users) {
- int i;
+ for (i = 0; i < UDP_CT_MAX; i++)
+ un->timeouts[i] = udp_timeouts[i];
- for (i = 0; i < UDP_CT_MAX; i++)
- un->timeouts[i] = udp_timeouts[i];
- }
-
- return udp_kmemdup_sysctl_table(pn, un);
-}
-
-static struct nf_proto_net *udp_get_net_proto(struct net *net)
-{
- return &net->ct.nf_ct_proto.udp.pn;
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ un->offload_timeout = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
{
.l4proto = IPPROTO_UDP,
.allow_clash = true,
- .packet = udp_packet,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
@@ -335,8 +298,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
.nla_policy = udp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = udp_init_net,
- .get_net_proto = udp_get_net_proto,
};
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
@@ -344,7 +305,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite =
{
.l4proto = IPPROTO_UDPLITE,
.allow_clash = true,
- .packet = udplite_packet,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
.nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
@@ -360,7 +320,5 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite =
.nla_policy = udp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
- .init_net = udp_init_net,
- .get_net_proto = udp_get_net_proto,
};
#endif
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 5072ff96ab33..13dc421fc4f5 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* SANE connection tracking helper
* (SANE = Scanner Access Now Easy)
* For documentation about the SANE network protocol see
@@ -11,10 +12,6 @@
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
* (C) 2003 Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,14 +27,12 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_sane.h>
+#define HELPER_NAME "sane"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>");
MODULE_DESCRIPTION("SANE connection tracking helper");
-MODULE_ALIAS_NFCT_HELPER("sane");
-
-static char *sane_buffer;
-
-static DEFINE_SPINLOCK(nf_sane_lock);
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
#define MAX_PORTS 8
static u_int16_t ports[MAX_PORTS];
@@ -68,14 +63,16 @@ static int help(struct sk_buff *skb,
unsigned int dataoff, datalen;
const struct tcphdr *th;
struct tcphdr _tcph;
- void *sb_ptr;
int ret = NF_ACCEPT;
int dir = CTINFO2DIR(ctinfo);
struct nf_ct_sane_master *ct_sane_info = nfct_help_data(ct);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple *tuple;
- struct sane_request *req;
struct sane_reply_net_start *reply;
+ union {
+ struct sane_request req;
+ struct sane_reply_net_start repl;
+ } buf;
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
@@ -93,56 +90,62 @@ static int help(struct sk_buff *skb,
return NF_ACCEPT;
datalen = skb->len - dataoff;
-
- spin_lock_bh(&nf_sane_lock);
- sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer);
- BUG_ON(sb_ptr == NULL);
-
if (dir == IP_CT_DIR_ORIGINAL) {
+ const struct sane_request *req;
+
if (datalen != sizeof(struct sane_request))
- goto out;
+ return NF_ACCEPT;
+
+ req = skb_header_pointer(skb, dataoff, datalen, &buf.req);
+ if (!req)
+ return NF_ACCEPT;
- req = sb_ptr;
if (req->RPC_code != htonl(SANE_NET_START)) {
/* Not an interesting command */
- ct_sane_info->state = SANE_STATE_NORMAL;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
+ return NF_ACCEPT;
}
/* We're interested in the next reply */
- ct_sane_info->state = SANE_STATE_START_REQUESTED;
- goto out;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_START_REQUESTED);
+ return NF_ACCEPT;
}
+ /* IP_CT_DIR_REPLY */
+
/* Is it a reply to an uninteresting command? */
- if (ct_sane_info->state != SANE_STATE_START_REQUESTED)
- goto out;
+ if (READ_ONCE(ct_sane_info->state) != SANE_STATE_START_REQUESTED)
+ return NF_ACCEPT;
/* It's a reply to SANE_NET_START. */
- ct_sane_info->state = SANE_STATE_NORMAL;
+ WRITE_ONCE(ct_sane_info->state, SANE_STATE_NORMAL);
if (datalen < sizeof(struct sane_reply_net_start)) {
pr_debug("NET_START reply too short\n");
- goto out;
+ return NF_ACCEPT;
}
- reply = sb_ptr;
+ datalen = sizeof(struct sane_reply_net_start);
+
+ reply = skb_header_pointer(skb, dataoff, datalen, &buf.repl);
+ if (!reply)
+ return NF_ACCEPT;
+
if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */
pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status));
- goto out;
+ return NF_ACCEPT;
}
/* Invalid saned reply? Ignore it. */
if (reply->zero != 0)
- goto out;
+ return NF_ACCEPT;
exp = nf_ct_expect_alloc(ct);
if (exp == NULL) {
nf_ct_helper_log(skb, ct, "cannot alloc expectation");
- ret = NF_DROP;
- goto out;
+ return NF_DROP;
}
tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
@@ -154,15 +157,12 @@ static int help(struct sk_buff *skb,
nf_ct_dump_tuple(&exp->tuple);
/* Can't expect this? Best to drop packet now. */
- if (nf_ct_expect_related(exp) != 0) {
+ if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
}
nf_ct_expect_put(exp);
-
-out:
- spin_unlock_bh(&nf_sane_lock);
return ret;
}
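
The rewrite above retires the shared 64 KiB buffer and its spinlock: only the fixed-size request or reply header is needed, and skb_header_pointer() copies it into an on-stack union when the skb data is non-linear. The pattern in isolation (sketch with a hypothetical header struct):

	struct proto_hdr hdr;
	const struct proto_hdr *ph;

	/* Returns a pointer into the skb when the data is linear,
	 * otherwise copies sizeof(hdr) bytes into the stack buffer.
	 */
	ph = skb_header_pointer(skb, dataoff, sizeof(hdr), &hdr);
	if (!ph)
		return NF_ACCEPT;	/* packet too short */
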
@@ -176,7 +176,6 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = {
static void __exit nf_conntrack_sane_fini(void)
{
nf_conntrack_helpers_unregister(sane, ports_c * 2);
- kfree(sane_buffer);
}
static int __init nf_conntrack_sane_init(void)
@@ -185,22 +184,18 @@ static int __init nf_conntrack_sane_init(void)
NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master));
- sane_buffer = kmalloc(65536, GFP_KERNEL);
- if (!sane_buffer)
- return -ENOMEM;
-
if (ports_c == 0)
ports[ports_c++] = SANE_PORT;
/* FIXME should be configurable whether IPv4 and IPv6 connections
are tracked or not - YK */
for (i = 0; i < ports_c; i++) {
- nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane",
- SANE_PORT, ports[i], ports[i],
+ nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP,
+ HELPER_NAME, SANE_PORT, ports[i], ports[i],
&sane_exp_policy, 0, help, NULL,
THIS_MODULE);
- nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane",
- SANE_PORT, ports[i], ports[i],
+ nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP,
+ HELPER_NAME, SANE_PORT, ports[i], ports[i],
&sane_exp_policy, 0, help, NULL,
THIS_MODULE);
}
@@ -208,7 +203,6 @@ static int __init nf_conntrack_sane_init(void)
ret = nf_conntrack_helpers_register(sane, ports_c * 2);
if (ret < 0) {
pr_err("failed to register helpers\n");
- kfree(sane_buffer);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 9da303461069..7ab2b25b57bc 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/types.h>
#include <linux/netfilter.h>
#include <net/tcp.h>
@@ -125,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
tcph = (void *)skb->data + protoff;
@@ -175,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
- if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, protoff + sizeof(*tcph)))
return 0;
tcph = (void *)skb->data + protoff;
@@ -231,19 +232,3 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct,
this_way->offset_after : this_way->offset_before;
}
EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
-
-static const struct nf_ct_ext_type nf_ct_seqadj_extend = {
- .len = sizeof(struct nf_conn_seqadj),
- .align = __alignof__(struct nf_conn_seqadj),
- .id = NF_CT_EXT_SEQADJ,
-};
-
-int nf_conntrack_seqadj_init(void)
-{
- return nf_ct_extend_register(&nf_ct_seqadj_extend);
-}
-
-void nf_conntrack_seqadj_fini(void)
-{
- nf_ct_extend_unregister(&nf_ct_seqadj_extend);
-}
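
Note the inverted return convention in the seqadj hunks above: skb_make_writable() returned a bool (true on success), while skb_ensure_writable() returns 0 on success and a negative errno on failure, so the "!" tests flip. A hypothetical wrapper makes the mapping explicit:

	static inline bool skb_make_writable_compat(struct sk_buff *skb,
						    unsigned int write_len)
	{
		/* skb_ensure_writable(): 0 on success, -errno on failure */
		return skb_ensure_writable(skb, write_len) == 0;
	}
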
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index c8d2b6688a2a..ca748f8dbff1 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* SIP extension for IP connection tracking.
*
* (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
* based on RR's ip_conntrack_ftp.c and other modules.
* (C) 2007 United Security Providers
* (C) 2007, 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -20,6 +17,8 @@
#include <linux/udp.h>
#include <linux/tcp.h>
#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -28,11 +27,13 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_sip.h>
+#define HELPER_NAME "sip"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
MODULE_DESCRIPTION("SIP connection tracking helper");
MODULE_ALIAS("ip_conntrack_sip");
-MODULE_ALIAS_NFCT_HELPER("sip");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
#define MAX_PORTS 8
static unsigned short ports[MAX_PORTS];
@@ -54,7 +55,12 @@ module_param(sip_direct_media, int, 0600);
MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling "
"endpoints only (default 1)");
-const struct nf_nat_sip_hooks *nf_nat_sip_hooks;
+static int sip_external_media __read_mostly = 0;
+module_param(sip_external_media, int, 0600);
+MODULE_PARM_DESC(sip_external_media, "Expect Media streams between external "
+ "endpoints (default 0)");
+
+const struct nf_nat_sip_hooks __rcu *nf_nat_sip_hooks;
EXPORT_SYMBOL_GPL(nf_nat_sip_hooks);
static int string_len(const struct nf_conn *ct, const char *dptr,
@@ -471,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
return ret;
if (ret == 0)
break;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
*in_header = 0;
}
@@ -483,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr,
break;
if (ret == 0)
return ret;
- dataoff += *matchoff;
+ dataoff = *matchoff;
}
if (in_header)
@@ -605,7 +611,7 @@ int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr,
start += strlen(name);
*val = simple_strtoul(start, &end, 0);
if (start == end)
- return 0;
+ return -1;
if (matchoff && matchlen) {
*matchoff = start - dptr;
*matchlen = end - start;
@@ -861,6 +867,36 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3))
return NF_ACCEPT;
saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ } else if (sip_external_media) {
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct net *net = dev_net(dev);
+ struct flowi fl;
+ struct dst_entry *dst = NULL;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = daddr->ip;
+ nf_ip_route(net, &dst, &fl, false);
+ break;
+
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = daddr->in6;
+ nf_ip6_route(net, &dst, &fl, false);
+ break;
+ }
+
+ /* Don't predict any conntracks when the media endpoint is reachable
+ * through the same interface as the signalling peer.
+ */
+ if (dst) {
+ bool external_media = (dst->dev == dev);
+
+ dst_release(dst);
+ if (external_media)
+ return NF_ACCEPT;
+ }
}
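
The external-media decision above, reduced to its core: route the media address and compare the egress device with the one the signalling packet is leaving on; if they match, the endpoints can exchange media directly and no expectation is installed. A sketch (hypothetical helper, assuming the dst was already resolved):

	static bool sip_media_is_external(const struct dst_entry *media_dst,
					  const struct net_device *sig_dev)
	{
		/* same egress device: media can flow peer-to-peer */
		return media_dst && media_dst->dev == sig_dev;
	}
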
/* We need to check whether the registration exists before attempting
@@ -891,7 +927,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
nfct_help(exp->master)->helper != nfct_help(ct)->helper ||
exp->class != class)
break;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (!direct_rtp &&
(!nf_inet_addr_cmp(&exp->saved_addr, &exp->tuple.dst.u3) ||
exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) &&
@@ -941,11 +977,15 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
/* -EALREADY handling works around end-points that send
* SDP messages with identical port but different media type,
* we pretend expectation was set up.
+ * It also covers the case where SDP messages are sent with
+ * identical expect tuples but for different master conntracks.
*/
- int errp = nf_ct_expect_related(rtp_exp);
+ int errp = nf_ct_expect_related(rtp_exp,
+ NF_CT_EXP_F_SKIP_MASTER);
if (errp == 0 || errp == -EALREADY) {
- int errcp = nf_ct_expect_related(rtcp_exp);
+ int errcp = nf_ct_expect_related(rtcp_exp,
+ NF_CT_EXP_F_SKIP_MASTER);
if (errcp == 0 || errcp == -EALREADY)
ret = NF_ACCEPT;
@@ -1189,6 +1229,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
struct nf_conntrack_expect *exp;
union nf_inet_addr *saddr, daddr;
const struct nf_nat_sip_hooks *hooks;
+ struct nf_conntrack_helper *helper;
__be16 port;
u8 proto;
unsigned int expires = 0;
@@ -1249,10 +1290,14 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
if (sip_direct_signalling)
saddr = &ct->tuplehash[!dir].tuple.src.u3;
+ helper = rcu_dereference(nfct_help(ct)->helper);
+ if (!helper)
+ return NF_DROP;
+
nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct),
saddr, &daddr, proto, NULL, &port);
exp->timeout.expires = sip_timeout * HZ;
- exp->helper = nfct_help(ct)->helper;
+ exp->helper = helper;
exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE;
hooks = rcu_dereference(nf_nat_sip_hooks);
@@ -1260,7 +1305,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
ret = hooks->expect(skb, protoff, dataoff, dptr, datalen,
exp, matchoff, matchlen);
else {
- if (nf_ct_expect_related(exp) != 0) {
+ if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
} else
@@ -1508,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1579,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1632,21 +1677,21 @@ static int __init nf_conntrack_sip_init(void)
ports[ports_c++] = SIP_PORT;
for (i = 0; i < ports_c; i++) {
- nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip",
- SIP_PORT, ports[i], i, sip_exp_policy,
- SIP_EXPECT_MAX, sip_help_udp,
+ nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP,
+ HELPER_NAME, SIP_PORT, ports[i], i,
+ sip_exp_policy, SIP_EXPECT_MAX, sip_help_udp,
NULL, THIS_MODULE);
- nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip",
- SIP_PORT, ports[i], i, sip_exp_policy,
- SIP_EXPECT_MAX, sip_help_tcp,
+ nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP,
+ HELPER_NAME, SIP_PORT, ports[i], i,
+ sip_exp_policy, SIP_EXPECT_MAX, sip_help_tcp,
NULL, THIS_MODULE);
- nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip",
- SIP_PORT, ports[i], i, sip_exp_policy,
- SIP_EXPECT_MAX, sip_help_udp,
+ nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP,
+ HELPER_NAME, SIP_PORT, ports[i], i,
+ sip_exp_policy, SIP_EXPECT_MAX, sip_help_udp,
NULL, THIS_MODULE);
- nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip",
- SIP_PORT, ports[i], i, sip_exp_policy,
- SIP_EXPECT_MAX, sip_help_tcp,
+ nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP,
+ HELPER_NAME, SIP_PORT, ports[i], i,
+ sip_exp_policy, SIP_EXPECT_MAX, sip_help_tcp,
NULL, THIS_MODULE);
}
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
index b8e0a22ca1a9..daacf2023fa5 100644
--- a/net/netfilter/nf_conntrack_snmp.c
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SNMP service broadcast connection tracking helper
*
* (c) 2011 Jiri Olsa <jolsa@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b6177fd73304..207b240b14e5 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -14,6 +14,7 @@
#include <linux/sysctl.h>
#endif
+#include <net/netfilter/nf_log.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
@@ -24,6 +25,10 @@
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <linux/rculist_nulls.h>
+static bool enable_hooks __read_mostly;
+MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks");
+module_param(enable_hooks, bool, 0000);
+
unsigned int nf_conntrack_net_id __read_mostly;
#ifdef CONFIG_NF_CONNTRACK_PROCFS
@@ -56,18 +61,13 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
ntohs(tuple->src.u.tcp.port),
ntohs(tuple->dst.u.tcp.port));
break;
- case IPPROTO_UDPLITE: /* fallthrough */
+ case IPPROTO_UDPLITE:
case IPPROTO_UDP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.udp.port),
ntohs(tuple->dst.u.udp.port));
break;
- case IPPROTO_DCCP:
- seq_printf(s, "sport=%hu dport=%hu ",
- ntohs(tuple->src.u.dccp.port),
- ntohs(tuple->dst.u.dccp.port));
- break;
case IPPROTO_SCTP:
seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.sctp.port),
@@ -94,69 +94,87 @@ struct ct_iter_state {
struct seq_net_private p;
struct hlist_nulls_head *hash;
unsigned int htable_size;
+ unsigned int skip_elems;
unsigned int bucket;
u_int64_t time_now;
};
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
+static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net,
+ struct ct_iter_state *st)
{
- struct ct_iter_state *st = seq->private;
+ struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
+ unsigned int i;
- for (st->bucket = 0;
- st->bucket < st->htable_size;
- st->bucket++) {
- n = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- if (!is_a_nulls(n))
- return n;
- }
- return NULL;
-}
+ for (i = st->bucket; i < st->htable_size; i++) {
+ unsigned int skip = 0;
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
- struct hlist_nulls_node *head)
-{
- struct ct_iter_state *st = seq->private;
+restart:
+ hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ struct hlist_nulls_node *tmp = n;
+
+ if (!net_eq(net, nf_ct_net(ct)))
+ continue;
+
+ if (++skip <= st->skip_elems)
+ continue;
- head = rcu_dereference(hlist_nulls_next_rcu(head));
- while (is_a_nulls(head)) {
- if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= st->htable_size)
- return NULL;
+ /* h will be returned; skip ahead to the nulls marker. */
+ while (!is_a_nulls(tmp))
+ tmp = rcu_dereference(hlist_nulls_next_rcu(tmp));
+
+ /* check if h is still linked to hash[i] */
+ if (get_nulls_value(tmp) != i) {
+ skip = 0;
+ goto restart;
+ }
+
+ st->skip_elems = skip;
+ st->bucket = i;
+ return h;
}
- head = rcu_dereference(
- hlist_nulls_first_rcu(&st->hash[st->bucket]));
- }
- return head;
-}
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct hlist_nulls_node *head = ct_get_first(seq);
+ skip = 0;
+ if (get_nulls_value(n) != i)
+ goto restart;
+
+ st->skip_elems = 0;
+ }
- if (head)
- while (pos && (head = ct_get_next(seq, head)))
- pos--;
- return pos ? NULL : head;
+ st->bucket = i;
+ return NULL;
}
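
The iterator now resumes by (bucket, skip count) rather than caching node pointers between reads; after skipping, it walks on to the list's nulls marker to confirm the entry is still linked to the same bucket, restarting the bucket if it moved. The revalidation step in isolation (sketch):

	/* Entries can be moved between buckets while we walk without
	 * locks; the nulls value encodes the bucket the chain ends in.
	 */
	static bool still_in_bucket(struct hlist_nulls_node *n, unsigned int i)
	{
		while (!is_a_nulls(n))
			n = rcu_dereference(hlist_nulls_next_rcu(n));
		return get_nulls_value(n) == i;
	}
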
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
struct ct_iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
st->time_now = ktime_get_real_ns();
rcu_read_lock();
nf_conntrack_get_ht(&st->hash, &st->htable_size);
- return ct_get_idx(seq, *pos);
+
+ if (*pos == 0) {
+ st->skip_elems = 0;
+ st->bucket = 0;
+ } else if (st->skip_elems) {
+ /* resume from last dumped entry */
+ st->skip_elems--;
+ }
+
+ return ct_get_next(net, st);
}
static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct ct_iter_state *st = s->private;
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- return ct_get_next(s, v);
+ return ct_get_next(net, st);
}
static void ct_seq_stop(struct seq_file *s, void *v)
@@ -168,17 +186,16 @@ static void ct_seq_stop(struct seq_file *s, void *v)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
+ struct lsm_context ctx;
int ret;
- u32 len;
- char *secctx;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return;
- seq_printf(s, "secctx=%s ", secctx);
+ seq_printf(s, "secctx=%s ", ctx.context);
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
}
#else
static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
@@ -258,16 +275,16 @@ static const char* l4proto_name(u16 proto)
case IPPROTO_ICMP: return "icmp";
case IPPROTO_TCP: return "tcp";
case IPPROTO_UDP: return "udp";
- case IPPROTO_DCCP: return "dccp";
case IPPROTO_GRE: return "gre";
case IPPROTO_SCTP: return "sctp";
case IPPROTO_UDPLITE: return "udplite";
+ case IPPROTO_ICMPV6: return "icmpv6";
}
return "unknown";
}
-static unsigned int
+static void
seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
{
struct nf_conn_acct *acct;
@@ -275,14 +292,12 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
acct = nf_conn_acct_find(ct);
if (!acct)
- return 0;
+ return;
counter = acct->counter;
seq_printf(s, "packets=%llu bytes=%llu ",
(unsigned long long)atomic64_read(&counter[dir].packets),
(unsigned long long)atomic64_read(&counter[dir].bytes));
-
- return 0;
}
/* return 0 on success, 1 in case of error */
@@ -295,10 +310,16 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
WARN_ON(!ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use)))
return 0;
+ /* load ->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
if (nf_ct_should_gc(ct)) {
+ struct ct_iter_state *st = s->private;
+
+ st->skip_elems--;
nf_ct_kill(ct);
goto release;
}
@@ -310,8 +331,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (!net_eq(nf_ct_net(ct), net))
goto release;
- l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct));
- WARN_ON(!l4proto);
+ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
ret = -ENOSPC;
seq_printf(s, "%-8s %u %-8s %u ",
@@ -332,8 +352,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (seq_has_overflowed(s))
goto release;
- if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL);
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
seq_puts(s, "[UNREPLIED] ");
@@ -342,10 +361,11 @@ static int ct_seq_show(struct seq_file *s, void *v)
ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
- if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
- goto release;
+ seq_print_acct(s, ct, IP_CT_DIR_REPLY);
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status))
+ seq_puts(s, "[HW_OFFLOAD] ");
+ else if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
seq_puts(s, "[OFFLOAD] ");
else if (test_bit(IPS_ASSURED_BIT, &ct->status))
seq_puts(s, "[ASSURED] ");
@@ -354,14 +374,14 @@ static int ct_seq_show(struct seq_file *s, void *v)
goto release;
#if defined(CONFIG_NF_CONNTRACK_MARK)
- seq_printf(s, "mark=%u ", ct->mark);
+ seq_printf(s, "mark=%u ", READ_ONCE(ct->mark));
#endif
ct_show_secctx(s, ct);
ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR);
ct_show_delta_time(s, ct);
- seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use));
+ seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use));
if (seq_has_overflowed(s))
goto release;
@@ -408,7 +428,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu + 1;
return per_cpu_ptr(net->ct.stat, cpu);
}
-
+ (*pos)++;
return NULL;
}
@@ -419,24 +439,26 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
static int ct_cpu_seq_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_net(seq);
- unsigned int nr_conntracks = atomic_read(&net->ct.count);
const struct ip_conntrack_stat *st = v;
+ unsigned int nr_conntracks;
if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
+ seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
+ nr_conntracks = nf_conntrack_count(net);
+
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
- 0,
+ st->clash_resolve,
st->found,
0,
st->invalid,
- st->ignore,
0,
0,
+ st->chaintoolong,
st->insert,
st->insert_failed,
st->drop,
@@ -502,22 +524,29 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
}
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
+u32 nf_conntrack_count(const struct net *net)
+{
+ const struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return atomic_read(&cnet->count);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_count);
+
/* Sysctl support */
#ifdef CONFIG_SYSCTL
-/* Log invalid packets of a given protocol */
-static int log_invalid_proto_min __read_mostly;
-static int log_invalid_proto_max __read_mostly = 255;
-
/* size the user *wants to set */
static unsigned int nf_conntrack_htable_size_user __read_mostly;
static int
-nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+nf_conntrack_hash_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
+ /* the hashsize module_param could have changed the value */
+ nf_conntrack_htable_size_user = nf_conntrack_htable_size;
+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret < 0 || !write)
return ret;
@@ -530,6 +559,29 @@ nf_conntrack_hash_sysctl(struct ctl_table *table, int write,
return ret;
}
+static int
+nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, i;
+
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (ret < 0 || !write)
+ return ret;
+
+ if (*(u8 *)table->data == 0)
+ return 0;
+
+ /* Load nf_log_syslog only if no logger is currently registered */
+ for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+ if (nf_log_is_registered(i))
+ return 0;
+ }
+ request_module("%s", "nf_log_syslog");
+
+ return 0;
+}
+
static struct ctl_table_header *nf_ct_netfilter_header;
enum nf_ct_sysctl_index {
@@ -540,13 +592,53 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_LOG_INVALID,
NF_SYSCTL_CT_EXPECT_MAX,
NF_SYSCTL_CT_ACCT,
- NF_SYSCTL_CT_HELPER,
#ifdef CONFIG_NF_CONNTRACK_EVENTS
NF_SYSCTL_CT_EVENTS,
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
NF_SYSCTL_CT_TIMESTAMP,
#endif
+ NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
+#endif
+ NF_SYSCTL_CT_PROTO_TCP_LOOSE,
+ NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+ NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
+ NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
+#endif
+ NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
+#endif
+
+ NF_SYSCTL_CT_LAST_SYSCTL,
};
static struct ctl_table nf_ct_sysctl_table[] = {
@@ -555,11 +647,12 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_COUNT] = {
.procname = "nf_conntrack_count",
- .data = &init_net.ct.count,
.maxlen = sizeof(int),
.mode = 0444,
.proc_handler = proc_dointvec,
@@ -574,59 +667,259 @@ static struct ctl_table nf_ct_sysctl_table[] = {
[NF_SYSCTL_CT_CHECKSUM] = {
.procname = "nf_conntrack_checksum",
.data = &init_net.ct.sysctl_checksum,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
[NF_SYSCTL_CT_LOG_INVALID] = {
.procname = "nf_conntrack_log_invalid",
.data = &init_net.ct.sysctl_log_invalid,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &log_invalid_proto_min,
- .extra2 = &log_invalid_proto_max,
+ .proc_handler = nf_conntrack_log_invalid_sysctl,
},
[NF_SYSCTL_CT_EXPECT_MAX] = {
.procname = "nf_conntrack_expect_max",
.data = &nf_ct_expect_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
[NF_SYSCTL_CT_ACCT] = {
.procname = "nf_conntrack_acct",
.data = &init_net.ct.sysctl_acct,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- [NF_SYSCTL_CT_HELPER] = {
- .procname = "nf_conntrack_helper",
- .data = &init_net.ct.sysctl_auto_assign_helper,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_NF_CONNTRACK_EVENTS
[NF_SYSCTL_CT_EVENTS] = {
.procname = "nf_conntrack_events",
.data = &init_net.ct.sysctl_events,
- .maxlen = sizeof(unsigned int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
[NF_SYSCTL_CT_TIMESTAMP] = {
.procname = "nf_conntrack_timestamp",
.data = &init_net.ct.sysctl_tstamp,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC] = {
+ .procname = "nf_conntrack_generic_timeout",
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT] = {
+ .procname = "nf_conntrack_tcp_timeout_syn_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_RECV] = {
+ .procname = "nf_conntrack_tcp_timeout_syn_recv",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ESTABLISHED] = {
+ .procname = "nf_conntrack_tcp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_FIN_WAIT] = {
+ .procname = "nf_conntrack_tcp_timeout_fin_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE_WAIT] = {
+ .procname = "nf_conntrack_tcp_timeout_close_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_LAST_ACK] = {
+ .procname = "nf_conntrack_tcp_timeout_last_ack",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_TIME_WAIT] = {
+ .procname = "nf_conntrack_tcp_timeout_time_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE] = {
+ .procname = "nf_conntrack_tcp_timeout_close",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS] = {
+ .procname = "nf_conntrack_tcp_timeout_max_retrans",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK] = {
+ .procname = "nf_conntrack_tcp_timeout_unacknowledged",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = {
+ .procname = "nf_flowtable_tcp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
+ [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
+ .procname = "nf_conntrack_tcp_loose",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = {
+ .procname = "nf_conntrack_tcp_be_liberal",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+ .procname = "nf_conntrack_tcp_ignore_invalid_rst",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
+ .procname = "nf_conntrack_tcp_max_retrans",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = {
+ .procname = "nf_conntrack_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM] = {
+ .procname = "nf_conntrack_udp_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = {
+ .procname = "nf_flowtable_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
+ .procname = "nf_conntrack_icmp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6] = {
+ .procname = "nf_conntrack_icmpv6_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_CLOSED] = {
+ .procname = "nf_conntrack_sctp_timeout_closed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_WAIT] = {
+ .procname = "nf_conntrack_sctp_timeout_cookie_wait",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_COOKIE_ECHOED] = {
+ .procname = "nf_conntrack_sctp_timeout_cookie_echoed",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ESTABLISHED] = {
+ .procname = "nf_conntrack_sctp_timeout_established",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_SENT] = {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_RECD] = {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_recd",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = {
+ .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT] = {
+ .procname = "nf_conntrack_sctp_timeout_heartbeat_sent",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = {
+ .procname = "nf_conntrack_gre_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM] = {
+ .procname = "nf_conntrack_gre_timeout_stream",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
},
#endif
- { }
};
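
[For readers unfamiliar with the ctl_table idiom used throughout this hunk: .extra1/.extra2 bound what a write may store, and proc_dou8vec_minmax enforces those bounds on a u8. A minimal illustrative entry in the same style; the name and variable below are hypothetical, not part of this patch:

    static u8 example_flag; /* hypothetical toggle, clamped to 0 or 1 */

    static struct ctl_table example_table[] = {
        {
            .procname     = "nf_example_flag",   /* hypothetical */
            .data         = &example_flag,
            .maxlen       = sizeof(u8),
            .mode         = 0644,
            .proc_handler = proc_dou8vec_minmax,
            .extra1       = SYSCTL_ZERO,         /* reject writes below 0 */
            .extra2       = SYSCTL_ONE,          /* reject writes above 1 */
        },
    };
]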
static struct ctl_table nf_ct_netfilter_table[] = {
@@ -635,61 +928,143 @@ static struct ctl_table nf_ct_netfilter_table[] = {
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
},
- { }
};
+static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
+ struct ctl_table *table)
+{
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+#define XASSIGN(XNAME, tn) \
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_ ## XNAME].data = \
+ &(tn)->timeouts[TCP_CONNTRACK_ ## XNAME]
+
+ XASSIGN(SYN_SENT, tn);
+ XASSIGN(SYN_RECV, tn);
+ XASSIGN(ESTABLISHED, tn);
+ XASSIGN(FIN_WAIT, tn);
+ XASSIGN(CLOSE_WAIT, tn);
+ XASSIGN(LAST_ACK, tn);
+ XASSIGN(TIME_WAIT, tn);
+ XASSIGN(CLOSE, tn);
+ XASSIGN(RETRANS, tn);
+ XASSIGN(UNACK, tn);
+#undef XASSIGN
+#define XASSIGN(XNAME, rval) \
+ table[NF_SYSCTL_CT_PROTO_TCP_ ## XNAME].data = (rval)
+
+ XASSIGN(LOOSE, &tn->tcp_loose);
+ XASSIGN(LIBERAL, &tn->tcp_be_liberal);
+ XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+ XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
+#undef XASSIGN
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
+#endif
+
+}
+
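
[The XASSIGN helper above works purely by token pasting: the same NAME selects both the sysctl slot and the timeout slot, so the two arrays cannot drift apart silently. Expanded by hand, one invocation reads:

    /* XASSIGN(SYN_SENT, tn) expands to: */
    table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_SYN_SENT].data =
        &(tn)->timeouts[TCP_CONNTRACK_SYN_SENT];
]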
+static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
+ struct ctl_table *table)
+{
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ struct nf_sctp_net *sn = nf_sctp_pernet(net);
+
+#define XASSIGN(XNAME, sn) \
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_ ## XNAME].data = \
+ &(sn)->timeouts[SCTP_CONNTRACK_ ## XNAME]
+
+ XASSIGN(CLOSED, sn);
+ XASSIGN(COOKIE_WAIT, sn);
+ XASSIGN(COOKIE_ECHOED, sn);
+ XASSIGN(ESTABLISHED, sn);
+ XASSIGN(SHUTDOWN_SENT, sn);
+ XASSIGN(SHUTDOWN_RECD, sn);
+ XASSIGN(SHUTDOWN_ACK_SENT, sn);
+ XASSIGN(HEARTBEAT_SENT, sn);
+#undef XASSIGN
+#endif
+}
+
+static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
+ struct ctl_table *table)
+{
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ struct nf_gre_net *gn = nf_gre_pernet(net);
+
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE].data = &gn->timeouts[GRE_CT_UNREPLIED];
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM].data = &gn->timeouts[GRE_CT_REPLIED];
+#endif
+}
+
static int nf_conntrack_standalone_init_sysctl(struct net *net)
{
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ struct nf_udp_net *un = nf_udp_pernet(net);
struct ctl_table *table;
+ BUILD_BUG_ON(ARRAY_SIZE(nf_ct_sysctl_table) != NF_SYSCTL_CT_LAST_SYSCTL);
+
table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table),
GFP_KERNEL);
if (!table)
- goto out_kmemdup;
+ return -ENOMEM;
- table[NF_SYSCTL_CT_COUNT].data = &net->ct.count;
+ table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
+ table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events;
#endif
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns) {
- table[NF_SYSCTL_CT_MAX].procname = NULL;
- table[NF_SYSCTL_CT_ACCT].procname = NULL;
- table[NF_SYSCTL_CT_HELPER].procname = NULL;
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL;
+ table[NF_SYSCTL_CT_TIMESTAMP].data = &net->ct.sysctl_tstamp;
#endif
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- table[NF_SYSCTL_CT_EVENTS].procname = NULL;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_GENERIC].data = &nf_generic_pernet(net)->timeout;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP].data = &nf_icmp_pernet(net)->timeout;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED];
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
#endif
- }
- if (!net_eq(&init_net, net))
+ nf_conntrack_standalone_init_tcp_sysctl(net, table);
+ nf_conntrack_standalone_init_sctp_sysctl(net, table);
+ nf_conntrack_standalone_init_gre_sysctl(net, table);
+
+ /* Don't allow non-init_net ns to alter global sysctls */
+ if (!net_eq(&init_net, net)) {
+ table[NF_SYSCTL_CT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ }
- net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
- if (!net->ct.sysctl_header)
+ cnet->sysctl_header = register_net_sysctl_sz(net, "net/netfilter",
+ table,
+ ARRAY_SIZE(nf_ct_sysctl_table));
+ if (!cnet->sysctl_header)
goto out_unregister_netfilter;
return 0;
out_unregister_netfilter:
kfree(table);
-out_kmemdup:
return -ENOMEM;
}
static void nf_conntrack_standalone_fini_sysctl(struct net *net)
{
- struct ctl_table *table;
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+ const struct ctl_table *table;
- table = net->ct.sysctl_header->ctl_table_arg;
- unregister_net_sysctl_table(net->ct.sysctl_header);
+ table = cnet->sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(cnet->sysctl_header);
kfree(table);
}
#else
@@ -703,31 +1078,47 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
}
#endif /* CONFIG_SYSCTL */
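
[The pattern in this hunk (kmemdup() a template table, repoint each .data at per-namespace storage, downgrade global knobs to 0444 outside init_net, then register_net_sysctl_sz()) generalizes to any per-netns sysctl. A hedged sketch with hypothetical names; struct net has no example_hdr field, it stands in for wherever the header is kept:

    static int example_net_sysctl_init(struct net *net)
    {
        struct ctl_table *table;

        /* each netns needs its own copy so .data can differ per netns */
        table = kmemdup(example_template, sizeof(example_template),
                        GFP_KERNEL);
        if (!table)
            return -ENOMEM;

        table[0].data = &net->example_value;   /* hypothetical field */

        if (!net_eq(&init_net, net))
            table[0].mode = 0444;   /* global knob: read-only elsewhere */

        net->example_hdr = register_net_sysctl_sz(net, "net/example", table,
                                                  ARRAY_SIZE(example_template));
        if (!net->example_hdr) {
            kfree(table);
            return -ENOMEM;
        }
        return 0;
    }
]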
+static void nf_conntrack_fini_net(struct net *net)
+{
+ if (enable_hooks)
+ nf_ct_netns_put(net, NFPROTO_INET);
+
+ nf_conntrack_standalone_fini_proc(net);
+ nf_conntrack_standalone_fini_sysctl(net);
+}
+
static int nf_conntrack_pernet_init(struct net *net)
{
int ret;
- ret = nf_conntrack_init_net(net);
+ net->ct.sysctl_checksum = 1;
+
+ ret = nf_conntrack_standalone_init_sysctl(net);
if (ret < 0)
- goto out_init;
+ return ret;
ret = nf_conntrack_standalone_init_proc(net);
if (ret < 0)
goto out_proc;
- net->ct.sysctl_checksum = 1;
- net->ct.sysctl_log_invalid = 0;
- ret = nf_conntrack_standalone_init_sysctl(net);
+ ret = nf_conntrack_init_net(net);
if (ret < 0)
- goto out_sysctl;
+ goto out_init_net;
+
+ if (enable_hooks) {
+ ret = nf_ct_netns_get(net, NFPROTO_INET);
+ if (ret < 0)
+ goto out_hooks;
+ }
return 0;
-out_sysctl:
+out_hooks:
+ nf_conntrack_cleanup_net(net);
+out_init_net:
nf_conntrack_standalone_fini_proc(net);
out_proc:
- nf_conntrack_cleanup_net(net);
-out_init:
+ nf_conntrack_standalone_fini_sysctl(net);
return ret;
}
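
[The reordered init path above keeps the usual kernel unwind discipline: each failure label undoes exactly the steps that already succeeded, in reverse. The skeleton of that idiom, with hypothetical step names:

        ret = setup_sysctl(net);
        if (ret < 0)
            return ret;

        ret = setup_proc(net);
        if (ret < 0)
            goto out_sysctl;

        ret = setup_core(net);
        if (ret < 0)
            goto out_proc;

        return 0;

    out_proc:
        teardown_proc(net);
    out_sysctl:
        teardown_sysctl(net);
        return ret;
]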
@@ -735,10 +1126,9 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
{
struct net *net;
- list_for_each_entry(net, net_exit_list, exit_list) {
- nf_conntrack_standalone_fini_sysctl(net);
- nf_conntrack_standalone_fini_proc(net);
- }
+ list_for_each_entry(net, net_exit_list, exit_list)
+ nf_conntrack_fini_net(net);
+
nf_conntrack_cleanup_net_list(net_exit_list);
}
@@ -755,7 +1145,6 @@ static int __init nf_conntrack_standalone_init(void)
if (ret < 0)
goto out_start;
- BUILD_BUG_ON(SKB_NFCT_PTRMASK != NFCT_PTRMASK);
BUILD_BUG_ON(NFCT_INFOMASK <= IP_CT_NUMBER);
#ifdef CONFIG_SYSCTL
@@ -770,11 +1159,12 @@ static int __init nf_conntrack_standalone_init(void)
nf_conntrack_htable_size_user = nf_conntrack_htable_size;
#endif
+ nf_conntrack_init_end();
+
ret = register_pernet_subsys(&nf_conntrack_net_ops);
if (ret < 0)
goto out_pernet;
- nf_conntrack_init_end();
return 0;
out_pernet:
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index 548b673b3625..80ee53f29f68 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -1,8 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -20,11 +18,13 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <linux/netfilter/nf_conntrack_tftp.h>
+#define HELPER_NAME "tftp"
+
MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
MODULE_DESCRIPTION("TFTP connection tracking helper");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ip_conntrack_tftp");
-MODULE_ALIAS_NFCT_HELPER("tftp");
+MODULE_ALIAS_NFCT_HELPER(HELPER_NAME);
#define MAX_PORTS 8
static unsigned short ports[MAX_PORTS];
@@ -78,7 +78,7 @@ static int tftp_help(struct sk_buff *skb,
nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook);
if (nf_nat_tftp && ct->status & IPS_NAT_MASK)
ret = nf_nat_tftp(skb, ctinfo, exp);
- else if (nf_ct_expect_related(exp) != 0) {
+ else if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, ct, "cannot add expectation");
ret = NF_DROP;
}
@@ -119,12 +119,14 @@ static int __init nf_conntrack_tftp_init(void)
ports[ports_c++] = TFTP_PORT;
for (i = 0; i < ports_c; i++) {
- nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp",
- TFTP_PORT, ports[i], i, &tftp_exp_policy,
- 0, tftp_help, NULL, THIS_MODULE);
- nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp",
- TFTP_PORT, ports[i], i, &tftp_exp_policy,
- 0, tftp_help, NULL, THIS_MODULE);
+ nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP,
+ HELPER_NAME, TFTP_PORT, ports[i], i,
+ &tftp_exp_policy, 0, tftp_help, NULL,
+ THIS_MODULE);
+ nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP,
+ HELPER_NAME, TFTP_PORT, ports[i], i,
+ &tftp_exp_policy, 0, tftp_help, NULL,
+ THIS_MODULE);
}
ret = nf_conntrack_helpers_register(tftp, ports_c * 2);
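
[The ports[] array consumed by this loop is conventionally populated from a module parameter; the declaration below mirrors the usual netfilter helper convention and is shown as an assumption, since it sits outside this hunk:

    static unsigned short ports[MAX_PORTS];
    static unsigned int ports_c;
    module_param_array(ports, ushort, &ports_c, 0400);
    MODULE_PARM_DESC(ports, "Port numbers of TFTP servers");

    /* e.g. "modprobe nf_conntrack_tftp ports=69,1069" registers one
     * IPv4 and one IPv6 helper per listed port, hence ports_c * 2.
     */
]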
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 91fbd183da2d..0cc584d3dbb1 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
* (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation (or any later at your option).
*/
#include <linux/types.h>
@@ -22,21 +19,24 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_timeout.h>
-struct nf_ct_timeout *
-(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
-
-void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook);
+const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_timeout_hook);
static int untimeout(struct nf_conn *ct, void *timeout)
{
struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext && (!timeout || timeout_ext->timeout == timeout))
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ if (timeout_ext) {
+ const struct nf_ct_timeout *t;
+
+ t = rcu_access_pointer(timeout_ext->timeout);
+
+ if (!timeout || t == timeout)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
/* We are not intended to delete this conntrack. */
return 0;
@@ -44,25 +44,103 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = timeout,
+ };
+
+ nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
-static const struct nf_ct_ext_type timeout_extend = {
- .len = sizeof(struct nf_conn_timeout),
- .align = __alignof__(struct nf_conn_timeout),
- .id = NF_CT_EXT_TIMEOUT,
-};
+static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout)
+{
+ const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook);
+
+ if (h)
+ h->timeout_put(timeout);
+}
-int nf_conntrack_timeout_init(void)
+int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
+ u8 l3num, u8 l4num, const char *timeout_name)
{
- int ret = nf_ct_extend_register(&timeout_extend);
- if (ret < 0)
- pr_err("nf_ct_timeout: Unable to register timeout extension.\n");
+ const struct nf_ct_timeout_hooks *h;
+ struct nf_ct_timeout *timeout;
+ struct nf_conn_timeout *timeout_ext;
+ const char *errmsg = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ h = rcu_dereference(nf_ct_timeout_hook);
+ if (!h) {
+ ret = -ENOENT;
+ errmsg = "Timeout policy base is empty";
+ goto out;
+ }
+
+ timeout = h->timeout_find_get(net, timeout_name);
+ if (!timeout) {
+ ret = -ENOENT;
+ pr_info_ratelimited("No such timeout policy \"%s\"\n",
+ timeout_name);
+ goto out;
+ }
+
+ if (timeout->l3num != l3num) {
+ ret = -EINVAL;
+ pr_info_ratelimited("Timeout policy `%s' can only be used by "
+ "L%d protocol number %d\n",
+ timeout_name, 3, timeout->l3num);
+ goto err_put_timeout;
+ }
+ /* Make sure the timeout policy matches any existing protocol tracker,
+ * otherwise default to generic.
+ */
+ if (timeout->l4proto->l4proto != l4num) {
+ ret = -EINVAL;
+ pr_info_ratelimited("Timeout policy `%s' can only be used by "
+ "L%d protocol number %d\n",
+ timeout_name, 4, timeout->l4proto->l4proto);
+ goto err_put_timeout;
+ }
+ timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
+ if (!timeout_ext) {
+ ret = -ENOMEM;
+ goto err_put_timeout;
+ }
+
+ rcu_read_unlock();
+ return ret;
+
+err_put_timeout:
+ __nf_ct_timeout_put(timeout);
+out:
+ rcu_read_unlock();
+ if (errmsg)
+ pr_info_ratelimited("%s\n", errmsg);
return ret;
}
+EXPORT_SYMBOL_GPL(nf_ct_set_timeout);
-void nf_conntrack_timeout_fini(void)
+void nf_ct_destroy_timeout(struct nf_conn *ct)
{
- nf_ct_extend_unregister(&timeout_extend);
+ struct nf_conn_timeout *timeout_ext;
+ const struct nf_ct_timeout_hooks *h;
+
+ rcu_read_lock();
+ h = rcu_dereference(nf_ct_timeout_hook);
+
+ if (h) {
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (timeout_ext) {
+ struct nf_ct_timeout *t;
+
+ t = rcu_dereference(timeout_ext->timeout);
+ if (t)
+ h->timeout_put(t);
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+ }
+ }
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
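
[Replacing two separately published function pointers with one RCU-managed structure means readers always observe a matched find_get/put pair, never a half-registered state. The shape of the pattern, sketched with hypothetical names:

    struct example_hooks {
        void *(*find_get)(struct net *net, const char *name);
        void (*put)(void *obj);
    };

    static const struct example_hooks __rcu *example_hook __read_mostly;

    static void *example_find(struct net *net, const char *name)
    {
        const struct example_hooks *h;
        void *obj = NULL;

        rcu_read_lock();
        h = rcu_dereference(example_hook);  /* both callbacks or neither */
        if (h)
            obj = h->find_get(net, name);
        rcu_read_unlock();
        return obj;
    }
]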
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index 705b912bd91f..9e43a0a59e73 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation (or any later at your option).
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -22,27 +19,7 @@ static bool nf_ct_tstamp __read_mostly;
module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
-static const struct nf_ct_ext_type tstamp_extend = {
- .len = sizeof(struct nf_conn_tstamp),
- .align = __alignof__(struct nf_conn_tstamp),
- .id = NF_CT_EXT_TSTAMP,
-};
-
void nf_conntrack_tstamp_pernet_init(struct net *net)
{
net->ct.sysctl_tstamp = nf_ct_tstamp;
}
-
-int nf_conntrack_tstamp_init(void)
-{
- int ret;
- ret = nf_ct_extend_register(&tstamp_extend);
- if (ret < 0)
- pr_err("Unable to register extension\n");
- return ret;
-}
-
-void nf_conntrack_tstamp_fini(void)
-{
- nf_ct_extend_unregister(&tstamp_extend);
-}
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index f4a566e67213..fab8b9011098 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -13,15 +10,48 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
-static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+#define NF_RECURSION_LIMIT 2
+
+#ifndef CONFIG_PREEMPT_RT
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+}
+#else
+
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return &current->net_xmit.nf_dup_skb_recursion;
+}
+
+#endif
+
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
+ enum nf_dev_hooks hook)
{
- if (skb_mac_header_was_set(skb))
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
+
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
+ goto err;
+
+ if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
+ if (skb_cow_head(skb, skb->mac_len))
+ goto err;
+
skb_push(skb, skb->mac_len);
+ }
skb->dev = dev;
+ skb_clear_tstamp(skb);
+ (*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
+ (*nf_dup_skb_recursion)--;
+ return;
+err:
+ kfree_skb(skb);
}
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
@@ -34,7 +64,7 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
return;
}
- nf_do_netdev_egress(pkt->skb, dev);
+ nf_do_netdev_egress(pkt->skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
@@ -49,9 +79,30 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
skb = skb_clone(pkt->skb, GFP_ATOMIC);
if (skb)
- nf_do_netdev_egress(skb, dev);
+ nf_do_netdev_egress(skb, dev, nft_hook(pkt));
}
EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
+int nft_fwd_dup_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ enum flow_action_id id, int oif)
+{
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+
+ /* nft_flow_rule_destroy() releases the reference on this device. */
+ dev = dev_get_by_index(ctx->net, oif);
+ if (!dev)
+ return -EOPNOTSUPP;
+
+ entry = &flow->rule->action.entries[ctx->num_actions++];
+ entry->id = id;
+ entry->dev = dev;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_fwd_dup_netdev_offload);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter packet duplication support");
diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c
new file mode 100644
index 000000000000..4a5f5195f2d2
--- /dev/null
+++ b/net/netfilter/nf_flow_table_bpf.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Flow Table Helpers for XDP hook
+ *
+ * These are called from the XDP programs.
+ * Note that it is allowed to break compatibility for these functions since
+ * the interface they are exposed through to BPF programs is explicitly
+ * unstable.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <net/xdp.h>
+
+/* bpf_flowtable_opts - options for bpf flowtable helpers
+ * @error: out parameter, set for any encountered error
+ */
+struct bpf_flowtable_opts {
+ s32 error;
+};
+
+enum {
+ NF_BPF_FLOWTABLE_OPTS_SZ = 4,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in nf_flow_table BTF");
+
+__bpf_kfunc_start_defs();
+
+static struct flow_offload_tuple_rhash *
+bpf_xdp_flow_tuple_lookup(struct net_device *dev,
+ struct flow_offload_tuple *tuple, __be16 proto)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *nf_flow_table;
+ struct flow_offload *nf_flow;
+
+ nf_flow_table = nf_flowtable_by_dev(dev);
+ if (!nf_flow_table)
+ return ERR_PTR(-ENOENT);
+
+ tuplehash = flow_offload_lookup(nf_flow_table, tuple);
+ if (!tuplehash)
+ return ERR_PTR(-ENOENT);
+
+ nf_flow = container_of(tuplehash, struct flow_offload,
+ tuplehash[tuplehash->tuple.dir]);
+ flow_offload_refresh(nf_flow_table, nf_flow, false);
+
+ return tuplehash;
+}
+
+__bpf_kfunc struct flow_offload_tuple_rhash *
+bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple,
+ struct bpf_flowtable_opts *opts, u32 opts_len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct flow_offload_tuple tuple = {
+ .iifidx = fib_tuple->ifindex,
+ .l3proto = fib_tuple->family,
+ .l4proto = fib_tuple->l4_protocol,
+ .src_port = fib_tuple->sport,
+ .dst_port = fib_tuple->dport,
+ };
+ struct flow_offload_tuple_rhash *tuplehash;
+ __be16 proto;
+
+ if (opts_len != NF_BPF_FLOWTABLE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ switch (fib_tuple->family) {
+ case AF_INET:
+ tuple.src_v4.s_addr = fib_tuple->ipv4_src;
+ tuple.dst_v4.s_addr = fib_tuple->ipv4_dst;
+ proto = htons(ETH_P_IP);
+ break;
+ case AF_INET6:
+ tuple.src_v6 = *(struct in6_addr *)&fib_tuple->ipv6_src;
+ tuple.dst_v6 = *(struct in6_addr *)&fib_tuple->ipv6_dst;
+ proto = htons(ETH_P_IPV6);
+ break;
+ default:
+ opts->error = -EAFNOSUPPORT;
+ return NULL;
+ }
+
+ tuplehash = bpf_xdp_flow_tuple_lookup(xdp->rxq->dev, &tuple, proto);
+ if (IS_ERR(tuplehash)) {
+ opts->error = PTR_ERR(tuplehash);
+ return NULL;
+ }
+
+ return tuplehash;
+}
+
+__bpf_kfunc_end_defs();
+
+__diag_pop();
+
+BTF_KFUNCS_START(nf_ft_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_KFUNCS_END(nf_ft_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_flow_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_ft_kfunc_set,
+};
+
+int nf_flow_register_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_flow_kfunc_set);
+}
+EXPORT_SYMBOL_GPL(nf_flow_register_bpf);
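
[On the BPF side the kfunc is reached through a hand-written __ksym extern. The sketch below assumes vmlinux.h supplies the kernel types and follows the shape of the in-tree XDP selftests; treat every detail as illustrative rather than a stable API:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    extern struct flow_offload_tuple_rhash *
    bpf_xdp_flow_lookup(struct xdp_md *ctx, struct bpf_fib_lookup *fib_tuple,
                        struct bpf_flowtable_opts *opts, __u32 opts_len) __ksym;

    SEC("xdp")
    int xdp_flow_probe(struct xdp_md *ctx)
    {
        struct bpf_flowtable_opts opts = {};
        struct bpf_fib_lookup tuple = {
            .family      = 2,   /* AF_INET */
            .l4_protocol = 6,   /* IPPROTO_TCP */
            /* addresses and ports would be parsed from the packet here */
        };

        /* non-NULL return: this 5-tuple has a live flowtable entry */
        if (bpf_xdp_flow_lookup(ctx, &tuple, &opts, sizeof(opts)))
            bpf_printk("flow is offloaded");

        return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";
]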
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index c0c72ae9df42..06e8251a6644 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -10,26 +11,18 @@
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-struct flow_offload_entry {
- struct flow_offload flow;
- struct nf_conn *ct;
- struct rcu_head rcu_head;
-};
-
static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);
static void
-flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
- struct nf_flow_route *route,
+flow_offload_fill_dir(struct flow_offload *flow,
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
- struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
- struct dst_entry *dst = route->tuple[dir].dst;
+ struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
ft->dir = dir;
@@ -37,115 +30,239 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
case NFPROTO_IPV4:
ft->src_v4 = ctt->src.u3.in;
ft->dst_v4 = ctt->dst.u3.in;
- ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
break;
case NFPROTO_IPV6:
ft->src_v6 = ctt->src.u3.in6;
ft->dst_v6 = ctt->dst.u3.in6;
- ft->mtu = ip6_dst_mtu_forward(dst);
break;
}
ft->l3proto = ctt->src.l3num;
ft->l4proto = ctt->dst.protonum;
- ft->src_port = ctt->src.u.tcp.port;
- ft->dst_port = ctt->dst.u.tcp.port;
- ft->iifidx = other_dst->dev->ifindex;
- ft->oifidx = dst->dev->ifindex;
- ft->dst_cache = dst;
+ switch (ctt->dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+ break;
+ }
}
-struct flow_offload *
-flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
- struct flow_offload_entry *entry;
struct flow_offload *flow;
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(nf_ct_is_dying(ct)))
return NULL;
- entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
- if (!entry)
- goto err_ct_refcnt;
-
- flow = &entry->flow;
-
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
- goto err_dst_cache_original;
-
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
- goto err_dst_cache_reply;
+ flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+ if (!flow)
+ return NULL;
- entry->ct = ct;
+ refcount_inc(&ct->ct_general.use);
+ flow->ct = ct;
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
- flow->flags |= FLOW_OFFLOAD_SNAT;
+ __set_bit(NF_FLOW_SNAT, &flow->flags);
if (ct->status & IPS_DST_NAT)
- flow->flags |= FLOW_OFFLOAD_DNAT;
+ __set_bit(NF_FLOW_DNAT, &flow->flags);
return flow;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
-err_dst_cache_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-err_dst_cache_original:
- kfree(entry);
-err_ct_refcnt:
- nf_ct_put(ct);
+static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
+{
+ if (flow_tuple->l3proto == NFPROTO_IPV6)
+ return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_offload_alloc);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
{
- tcp->state = TCP_CONNTRACK_ESTABLISHED;
- tcp->seen[0].td_maxwin = 0;
- tcp->seen[1].td_maxwin = 0;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ route->tuple[dir].dst = NULL;
+
+ return dst;
}
-#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
-#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
+static int flow_offload_fill_route(struct flow_offload *flow,
+ struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ struct dst_entry *dst = nft_route_dst_fetch(route, dir);
+ int i, j = 0;
-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
+ break;
+ case NFPROTO_IPV6:
+ flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
+ break;
+ }
+
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
+ j++;
+ }
+
+ flow_tuple->tun = route->tuple[dir].in.tun;
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
+ flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ dst_release(dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ flow_tuple->ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->dst_cache = dst;
+ flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
+
+ return 0;
+}
+
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
{
- const struct nf_conntrack_l4proto *l4proto;
- unsigned int timeout;
- int l4num;
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
- l4num = nf_ct_protonum(ct);
- if (l4num == IPPROTO_TCP)
- flow_offload_fixup_tcp(&ct->proto.tcp);
+void flow_offload_route_init(struct flow_offload *flow,
+ struct nf_flow_route *route)
+{
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
+ flow->type = NF_FLOW_OFFLOAD_ROUTE;
+}
+EXPORT_SYMBOL_GPL(flow_offload_route_init);
- l4proto = __nf_ct_l4proto_find(l4num);
- if (!l4proto)
- return;
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
- if (l4num == IPPROTO_TCP)
- timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
- else if (l4num == IPPROTO_UDP)
- timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
- else
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
+{
+ struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+ spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ /* Conntrack state is outdated due to offload bypass.
+	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
+ * TCP reset validation will fail.
+ */
+ tcp->seen[0].td_maxwin = 0;
+ tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ tcp->seen[1].td_maxwin = 0;
+ tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ spin_unlock_bh(&ct->lock);
+}
+
+static void flow_offload_fixup_ct(struct flow_offload *flow)
+{
+ struct nf_conn *ct = flow->ct;
+ struct net *net = nf_ct_net(ct);
+ int l4num = nf_ct_protonum(ct);
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
+ s32 timeout;
+
+ if (l4num == IPPROTO_TCP) {
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
+
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
+
+ } else if (l4num == IPPROTO_UDP) {
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
+ enum udp_conntrack state =
+ test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+ UDP_CT_REPLIED : UDP_CT_UNREPLIED;
+
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
+ } else {
return;
+ }
- ct->timeout = nfct_time_stamp + timeout;
+ if (expired)
+ timeout -= offload_timeout;
+
+ if (timeout < 0)
+ timeout = 0;
+
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
-void flow_offload_free(struct flow_offload *flow)
+static void flow_offload_route_release(struct flow_offload *flow)
{
- struct flow_offload_entry *e;
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
+}
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
- e = container_of(flow, struct flow_offload_entry, flow);
- if (flow->flags & FLOW_OFFLOAD_DYING)
- nf_ct_delete(e->ct, 0, 0);
- nf_ct_put(e->ct);
- kfree_rcu(e, rcu_head);
+void flow_offload_free(struct flow_offload *flow)
+{
+ switch (flow->type) {
+ case NF_FLOW_OFFLOAD_ROUTE:
+ flow_offload_route_release(flow);
+ break;
+ default:
+ break;
+ }
+ nf_ct_put(flow->ct);
+ kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
@@ -153,14 +270,14 @@ static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple *tuple = data;
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -169,7 +286,7 @@ static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
return 1;
return 0;
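
[Moving the jhash/memcmp bound from dir to __hash relies on struct layout as the contract: every field before the marker is lookup key, everything after is mutable state excluded from hashing and comparison. The same technique in miniature, with a hypothetical struct:

    struct example_key {
        u32 src_ip;
        u32 dst_ip;   /* key fields must sit before the marker ... */
        u32 __hash;   /* marker only: fields below are per-entry state */
        u32 refcnt;
    };

    static u32 example_hash(const struct example_key *k, u32 seed)
    {
        /* hash bytes [0, offsetof(__hash)) only; needs <linux/jhash.h> */
        return jhash(k, offsetof(struct example_key, __hash), seed);
    }
]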
@@ -183,46 +300,94 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = {
.automatic_shrinking = true,
};
+unsigned long flow_offload_get_timeout(struct flow_offload *flow)
+{
+ unsigned long timeout = NF_FLOW_TIMEOUT;
+ struct net *net = nf_ct_net(flow->ct);
+ int l4num = nf_ct_protonum(flow->ct);
+
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ }
+
+ return timeout;
+}
+
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
- flow->timeout = (u32)jiffies;
+ int err;
+
+ flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
+
+ err = rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[0].node,
+ nf_flow_offload_rhash_params);
+ if (err < 0)
+ return err;
+
+ err = rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[1].node,
+ nf_flow_offload_rhash_params);
+ if (err < 0) {
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[0].node,
+ nf_flow_offload_rhash_params);
+ return err;
+ }
+
+ nf_ct_refresh(flow->ct, NF_CT_DAY);
+
+ if (nf_flowtable_hw_offload(flow_table)) {
+ __set_bit(NF_FLOW_HW, &flow->flags);
+ nf_flow_offload_add(flow_table, flow);
+ }
- rhashtable_insert_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- nf_flow_offload_rhash_params);
- rhashtable_insert_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- nf_flow_offload_rhash_params);
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
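
[With the reworked API, a flowtable user such as the nft_flow_offload expression drives the lifecycle roughly as follows (condensed sketch; route is assumed to have been filled in by the caller and the error codes are illustrative):

        struct flow_offload *flow;

        flow = flow_offload_alloc(ct);          /* takes a ct reference */
        if (!flow)
            return -ENOMEM;

        flow_offload_route_init(flow, &route);  /* consumes route's dsts */

        if (flow_offload_add(flowtable, flow) < 0) {
            flow_offload_free(flow);            /* drops the ct reference */
            return -ENOENT;
        }
]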
+void flow_offload_refresh(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, bool force)
+{
+ u32 timeout;
+
+ timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
+ if (force || timeout - READ_ONCE(flow->timeout) > HZ)
+ WRITE_ONCE(flow->timeout, timeout);
+ else
+ return;
+
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
+ return;
+
+ nf_flow_offload_add(flow_table, flow);
+}
+EXPORT_SYMBOL_GPL(flow_offload_refresh);
+
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
nf_flow_offload_rhash_params);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
-
- e = container_of(flow, struct flow_offload_entry, flow);
- clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
-
flow_offload_free(flow);
}
void flow_offload_teardown(struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
- flow->flags |= FLOW_OFFLOAD_TEARDOWN;
-
- e = container_of(flow, struct flow_offload_entry, flow);
- flow_offload_fixup_ct_state(e->ct);
+ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+ if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -241,7 +406,10 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ return NULL;
+
+ if (unlikely(nf_ct_is_dying(flow->ct)))
return NULL;
return tuplehash;
@@ -250,7 +418,8 @@ EXPORT_SYMBOL_GPL(flow_offload_lookup);
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
+ void (*iter)(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, void *data),
void *data)
{
struct flow_offload_tuple_rhash *tuplehash;
@@ -274,7 +443,7 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
- iter(flow, data);
+ iter(flow_table, flow, data);
}
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
@@ -282,18 +451,144 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
return err;
}
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
+ const struct flow_offload *flow)
+{
+ return flow_table->type->gc && flow_table->type->gc(flow);
+}
+
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return: timeout in jiffies after which the ct entry should expire.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
{
- return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+ u8 state = READ_ONCE(ct->proto.tcp.state);
+
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
}
-static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire; this saves us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
{
- struct nf_flowtable *flow_table = data;
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
+static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
+{
+ bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
if (nf_flow_has_expired(flow) ||
- (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
- flow_offload_del(flow_table, flow);
+ nf_ct_is_dying(flow->ct) ||
+ nf_flow_custom_gc(flow_table, flow)) {
+ flow_offload_teardown(flow);
+ teardown = true;
+ } else if (!teardown) {
+ nf_flow_table_extend_ct_timeout(flow->ct);
+ }
+
+ if (teardown) {
+ if (test_bit(NF_FLOW_HW, &flow->flags)) {
+ if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
+ nf_flow_offload_del(flow_table, flow);
+ else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
+ flow_offload_del(flow_table, flow);
+ } else {
+ flow_offload_del(flow_table, flow);
+ }
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
+ } else if (test_bit(NF_FLOW_HW, &flow->flags)) {
+ nf_flow_offload_stats(flow_table, flow);
+ }
+}
+
+void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
}
static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -301,73 +596,53 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
struct nf_flowtable *flow_table;
flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_gc_run(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
-
- return 0;
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
- new_port, true);
+ new_port, false);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -381,25 +656,19 @@ int nf_flow_snat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -413,11 +682,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
@@ -425,7 +692,9 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
{
int err;
- INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+ flow_block_init(&flowtable->flow_block);
+ init_rwsem(&flowtable->flow_block_lock);
err = rhashtable_init(&flowtable->rhashtable,
&nf_flow_offload_rhash_params);
@@ -443,28 +712,28 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
- struct flow_offload_entry *e;
-
- e = container_of(flow, struct flow_offload_entry, flow);
if (!dev) {
flow_offload_teardown(flow);
return;
}
- if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
+
+ if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
(flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
flow->tuplehash[1].tuple.iifidx == dev->ifindex))
- flow_offload_dead(flow);
+ flow_offload_teardown(flow);
}
-static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- struct net_device *dev)
+void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
+ struct net_device *dev)
{
nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
+ nf_flow_table_offload_flush(flowtable);
}
void nf_flow_table_cleanup(struct net_device *dev)
@@ -473,7 +742,7 @@ void nf_flow_table_cleanup(struct net_device *dev)
mutex_lock(&flowtable_lock);
list_for_each_entry(flowtable, &flowtables, list)
- nf_flow_table_iterate_cleanup(flowtable, dev);
+ nf_flow_table_gc_cleanup(flowtable, dev);
mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
@@ -483,12 +752,96 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_lock(&flowtable_lock);
list_del(&flow_table->list);
mutex_unlock(&flowtable_lock);
+
cancel_delayed_work_sync(&flow_table->gc_work);
+ nf_flow_table_offload_flush(flow_table);
+ /* ... no more pending work after this stage ... */
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
+ nf_flow_table_gc_run(flow_table);
+ nf_flow_table_offload_flush_cleanup(flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int nf_flow_table_init_net(struct net *net)
+{
+ net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
+ return net->ft.stat ? 0 : -ENOMEM;
+}
+
+static void nf_flow_table_fini_net(struct net *net)
+{
+ free_percpu(net->ft.stat);
+}
+
+static int nf_flow_table_pernet_init(struct net *net)
+{
+ int ret;
+
+ ret = nf_flow_table_init_net(net);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_init_proc(net);
+ if (ret < 0)
+ goto out_proc;
+
+ return 0;
+
+out_proc:
+ nf_flow_table_fini_net(net);
+ return ret;
+}
+
+static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_flow_table_fini_proc(net);
+ nf_flow_table_fini_net(net);
+ }
+}
+
+static struct pernet_operations nf_flow_table_net_ops = {
+ .init = nf_flow_table_pernet_init,
+ .exit_batch = nf_flow_table_pernet_exit,
+};
+
+static int __init nf_flow_table_module_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_flow_table_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_flow_table_offload_init();
+ if (ret)
+ goto out_offload;
+
+ ret = nf_flow_register_bpf();
+ if (ret)
+ goto out_bpf;
+
+ return 0;
+
+out_bpf:
+ nf_flow_table_offload_exit();
+out_offload:
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
+ return ret;
+}
+
+static void __exit nf_flow_table_module_exit(void)
+{
+ nf_flow_table_offload_exit();
+ unregister_pernet_subsys(&nf_flow_table_net_ops);
+}
+
+module_init(nf_flow_table_module_init);
+module_exit(nf_flow_table_module_exit);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter flow table module");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 99771aa7e7ea..b0f199171932 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -5,12 +6,33 @@
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
+#include <linux/if_vlan.h>
static unsigned int
nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return NF_ACCEPT;
+
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (!nf_flow_pppoe_proto(skb, &proto))
+ return NF_ACCEPT;
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
case htons(ETH_P_IP):
return nf_flow_offload_ip_hook(priv, skb, state);
case htons(ETH_P_IPV6):
@@ -20,16 +42,63 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
return NF_ACCEPT;
}
+static int nf_flow_rule_route_inet(struct net *net,
+ struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ int err;
+
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+ break;
+ case NFPROTO_IPV6:
+ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+ break;
+ default:
+ err = -1;
+ break;
+ }
+
+ return err;
+}
+
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
.init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_inet,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
};
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv4,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+ .family = NFPROTO_IPV6,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_ipv6,
+ .free = nf_flow_table_free,
+ .hook = nf_flow_offload_ipv6_hook,
+ .owner = THIS_MODULE,
+};
+
static int __init nf_flow_inet_module_init(void)
{
+ nft_register_flowtable_type(&flowtable_ipv4);
+ nft_register_flowtable_type(&flowtable_ipv6);
nft_register_flowtable_type(&flowtable_inet);
return 0;
@@ -38,6 +107,8 @@ static int __init nf_flow_inet_module_init(void)
static void __exit nf_flow_inet_module_exit(void)
{
nft_unregister_flowtable_type(&flowtable_inet);
+ nft_unregister_flowtable_type(&flowtable_ipv6);
+ nft_unregister_flowtable_type(&flowtable_ipv4);
}
module_init(nf_flow_inet_module_init);
@@ -45,4 +116,7 @@ module_exit(nf_flow_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */
+MODULE_DESCRIPTION("Netfilter flow table mixed IPv4/IPv6 module");
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 1d291a51cd45..78883343e5d6 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -6,11 +7,14 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <net/gso.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack_acct.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -23,42 +27,33 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
if (proto != IPPROTO_TCP)
return 0;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
}
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
@@ -66,31 +61,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
+static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
{
switch (iph->protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -105,17 +94,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
iph->daddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -130,29 +117,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
iph->saddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- unsigned int thoff, enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, enum flow_offload_tuple_dir dir,
+ struct iphdr *iph)
{
- struct iphdr *iph = ip_hdr(skb);
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir);
+ }
}
static bool ip_has_options(unsigned int thoff)
@@ -160,40 +142,122 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
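+/* Record VLAN/PPPoE encapsulation, and the outer addresses of an IPIP
+ * tunnel if one is present, in the lookup tuple.
+ */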
+static void nf_flow_tuple_encap(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ __be16 inner_proto = skb->protocol;
+ struct vlan_ethhdr *veth;
+ struct pppoe_hdr *phdr;
+ struct iphdr *iph;
+ u16 offset = 0;
+ int i = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ tuple->encap[i].id = skb_vlan_tag_get(skb);
+ tuple->encap[i].proto = skb->vlan_proto;
+ i++;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
+ tuple->encap[i].proto = skb->protocol;
+ inner_proto = veth->h_vlan_encapsulated_proto;
+ offset += VLAN_HLEN;
+ break;
+ case htons(ETH_P_PPP_SES):
+ phdr = (struct pppoe_hdr *)skb_network_header(skb);
+ tuple->encap[i].id = ntohs(phdr->sid);
+ tuple->encap[i].proto = skb->protocol;
+ inner_proto = *((__be16 *)(phdr + 1));
+ offset += PPPOE_SES_HLEN;
+ break;
+ }
+
+ if (inner_proto == htons(ETH_P_IP)) {
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ if (iph->protocol == IPPROTO_IPIP) {
+ tuple->tun.dst_v4.s_addr = iph->daddr;
+ tuple->tun.src_v4.s_addr = iph->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPIP;
+ }
+ }
+}
+
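+/* Per-packet parsing context: the ingress device, the offset from the
+ * network header to the inner IP header (non-zero when encapsulated),
+ * and the expected L4 header size.
+ */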
+struct nf_flowtable_ctx {
+ const struct net_device *in;
+ u32 offset;
+ u32 hdrsize;
+};
+
+static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
+ u8 ipproto;
- if (!pskb_may_pull(skb, sizeof(*iph)))
+ if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
return -1;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
+ thoff += ctx->offset;
+
+ ipproto = iph->protocol;
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ ctx->hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ ctx->hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
return -1;
+ }
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (iph->ttl <= 1)
return -1;
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
+ return -1;
+
+ switch (ipproto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v4.s_addr = iph->saddr;
tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
- tuple->iifidx = dev->ifindex;
+ tuple->l4proto = ipproto;
+ tuple->iifidx = ctx->in->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -210,93 +274,421 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
return true;
}
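+/* A dst is only cached for NEIGH/XFRM transmission; revalidate it
+ * before reusing the cached route.
+ */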
+static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple)
+{
+ if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH &&
+ tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM)
+ return true;
+
+ return dst_check(tuple->dst_cache, tuple->dst_cookie);
+}
+
+static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ struct dst_entry *dst)
+{
+ skb_orphan(skb);
+ skb_dst_set_noref(skb, dst);
+ dst_output(state->net, state->sk, skb);
+ return NF_STOLEN;
+}
+
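+/* Validate the IPv4 header at *psize and, for an IPIP outer header,
+ * advance *psize past it so the inner header feeds the flow lookup.
+ */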
+static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+{
+ struct iphdr *iph;
+ u16 size;
+
+ if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+ return false;
+
+ iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+ size = iph->ihl << 2;
+
+ if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
+ return false;
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (iph->protocol == IPPROTO_IPIP)
+ *psize += size;
+
+ return true;
+}
+
+static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+
+ if (iph->protocol != IPPROTO_IPIP)
+ return;
+
+ skb_pull(skb, iph->ihl << 2);
+ skb_reset_network_header(skb);
+}
+
+static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
+ u32 *offset)
+{
+ __be16 inner_proto = skb->protocol;
+ struct vlan_ethhdr *veth;
+ bool ret = false;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
+ return false;
+
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ if (veth->h_vlan_encapsulated_proto == proto) {
+ *offset += VLAN_HLEN;
+ inner_proto = proto;
+ ret = true;
+ }
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_proto(skb, &inner_proto) &&
+ inner_proto == proto) {
+ *offset += PPPOE_SES_HLEN;
+ ret = true;
+ }
+ break;
+ }
+
+ if (inner_proto == htons(ETH_P_IP))
+ ret = nf_flow_ip4_tunnel_proto(skb, offset);
+
+ return ret;
+}
+
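+/* Strip the VLAN/PPPoE tags recorded in the tuple, then any outer
+ * IPIP header, so NAT and transmission operate on the inner packet.
+ */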
+static void nf_flow_encap_pop(struct sk_buff *skb,
+ struct flow_offload_tuple_rhash *tuplehash)
+{
+ struct vlan_hdr *vlan_hdr;
+ int i;
+
+ for (i = 0; i < tuplehash->tuple.encap_num; i++) {
+ if (skb_vlan_tag_present(skb)) {
+ __vlan_hwaccel_clear_tag(skb);
+ continue;
+ }
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ vlan_hdr = (struct vlan_hdr *)skb->data;
+ __skb_pull(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vlan_hdr);
+ skb_reset_network_header(skb);
+ break;
+ case htons(ETH_P_PPP_SES):
+ skb->protocol = __nf_flow_pppoe_proto(skb);
+ skb_pull(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+ break;
+ }
+ }
+
+ if (skb->protocol == htons(ETH_P_IP))
+ nf_flow_ip4_tunnel_pop(skb);
+}
+
+struct nf_flow_xmit {
+ const void *dest;
+ const void *source;
+ struct net_device *outdev;
+};
+
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ struct nf_flow_xmit *xmit)
+{
+ skb->dev = xmit->outdev;
+ dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
+ xmit->dest, xmit->source, skb->len);
+ dev_queue_xmit(skb);
+
+ return NF_STOLEN;
+}
+
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table, struct sk_buff *skb)
+{
+ struct flow_offload_tuple tuple = {};
+
+ if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+
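+/* Decide whether this packet can take the fast path: returns 1 to
+ * forward, 0 to fall back to the classic path, -1 to drop.
+ */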
+static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
+{
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ unsigned int thoff, mtu;
+ struct iphdr *iph;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (flow->tuplehash[!dir].tuple.tun_num)
+ mtu -= sizeof(*iph);
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ return 0;
+
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = (iph->ihl * 4) + ctx->offset;
+ if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
+ return 0;
+
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
+ flow_offload_teardown(flow);
+ return 0;
+ }
+
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
+
+ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= ctx->offset;
+
+ iph = ip_hdr(skb);
+ nf_flow_nat_ip(flow, skb, thoff, dir, iph);
+
+ ip_decrease_ttl(iph);
+ skb_clear_tstamp(skb);
+
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
+ return 1;
+}
+
+static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id)
+{
+ int data_len = skb->len + sizeof(__be16);
+ struct ppp_hdr {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *ph;
+ __be16 proto;
+
+ if (skb_cow_head(skb, PPPOE_SES_HLEN))
+ return -1;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ proto = htons(PPP_IP);
+ break;
+ case htons(ETH_P_IPV6):
+ proto = htons(PPP_IPV6);
+ break;
+ default:
+ return -1;
+ }
+
+ __skb_push(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+
+ ph = (struct ppp_hdr *)(skb->data);
+ ph->hdr.ver = 1;
+ ph->hdr.type = 1;
+ ph->hdr.code = 0;
+ ph->hdr.sid = htons(id);
+ ph->hdr.length = htons(data_len);
+ ph->proto = proto;
+ skb->protocol = htons(ETH_P_PPP_SES);
+
+ return 0;
+}
+
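+/* Rebuild the outer IPv4 header for a flow that arrived through an
+ * IPIP tunnel, inheriting TOS, TTL and the frag bits from the inner
+ * header.
+ */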
+static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+ struct rtable *rt = dst_rtable(tuple->dst_cache);
+ u8 tos = iph->tos, ttl = iph->ttl;
+ __be16 frag_off = iph->frag_off;
+ u32 headroom = sizeof(*iph);
+ int err;
+
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4);
+ if (err)
+ return err;
+
+ skb_set_inner_ipproto(skb, IPPROTO_IPIP);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ err = skb_cow_head(skb, headroom);
+ if (err)
+ return err;
+
+ skb_scrub_packet(skb, true);
+ skb_clear_hash_if_not_l4(skb);
+
+ /* Push down and install the IP header. */
+ skb_push(skb, sizeof(*iph));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off;
+ iph->protocol = tuple->tun.l3_proto;
+ iph->tos = tos;
+ iph->daddr = tuple->tun.src_v4.s_addr;
+ iph->saddr = tuple->tun.dst_v4.s_addr;
+ iph->ttl = ttl;
+ iph->tot_len = htons(skb->len);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
+ ip_send_check(iph);
+
+ *ip_daddr = tuple->tun.src_v4.s_addr;
+
+ return 0;
+}
+
+static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ __be32 *ip_daddr)
+{
+ if (tuple->tun_num)
+ return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr);
+
+ return 0;
+}
+
+static int nf_flow_encap_push(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ int i;
+
+ for (i = 0; i < tuple->encap_num; i++) {
+ switch (tuple->encap[i].proto) {
+ case htons(ETH_P_8021Q):
+ case htons(ETH_P_8021AD):
+ if (skb_vlan_push(skb, tuple->encap[i].proto,
+ tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0)
+ return -1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct flow_offload_tuple_rhash *tuplehash;
struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
+ struct flow_offload_tuple *other_tuple;
enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
struct flow_offload *flow;
- struct net_device *outdev;
+ struct neighbour *neigh;
struct rtable *rt;
- unsigned int thoff;
- struct iphdr *iph;
- __be32 nexthop;
-
- if (skb->protocol != htons(ETH_P_IP))
- return NF_ACCEPT;
+ __be32 ip_daddr;
+ int ret;
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb);
+ if (!tuplehash)
return NF_ACCEPT;
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
+ ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
return NF_ACCEPT;
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
+ memset(skb->cb, 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->iif = skb->dev->ifindex;
+ IPCB(skb)->flags = IPSKB_FORWARDED;
+ return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+ }
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip_daddr = other_tuple->src_v4.s_addr;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
- (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*iph)))
+ if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0)
return NF_DROP;
- thoff = ip_hdr(skb)->ihl * 4;
- if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
- return NF_ACCEPT;
-
- if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
return NF_DROP;
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- iph = ip_hdr(skb);
- ip_decrease_ttl(iph);
-
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = dst_rtable(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha;
+ skb_dst_set_noref(skb, &rt->dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest;
+ xmit.source = tuplehash->tuple.out.h_source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
- return NF_STOLEN;
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr,
+ struct ipv6hdr *ip6h)
{
struct tcphdr *tcph;
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
-
- return 0;
}
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
struct udphdr *udph;
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
@@ -304,32 +696,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -344,17 +730,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
ip6h->daddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -369,119 +753,231 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
ip6h->saddr = new_addr;
break;
- default:
- return -1;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
}
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
+ u8 nexthdr;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ thoff = sizeof(*ip6h) + ctx->offset;
+ if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
+ nexthdr = ip6h->nexthdr;
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ ctx->hdrsize = sizeof(struct tcphdr);
+ break;
+ case IPPROTO_UDP:
+ ctx->hdrsize = sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE:
+ ctx->hdrsize = sizeof(struct gre_base_hdr);
+ break;
+#endif
+ default:
+ return -1;
+ }
+
+ if (ip6h->hop_limit <= 1)
return -1;
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ if (!pskb_may_pull(skb, thoff + ctx->hdrsize))
return -1;
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ switch (nexthdr) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ break;
+ case IPPROTO_GRE: {
+ struct gre_base_hdr *greh;
+
+ greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
+ if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
+ return -1;
+ break;
+ }
+ }
+
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
tuple->src_v6 = ip6h->saddr;
tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
- tuple->iifidx = dev->ifindex;
+ tuple->l4proto = nexthdr;
+ tuple->iifidx = ctx->in->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
+static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct flow_offload_tuple_rhash *tuplehash,
+ struct sk_buff *skb)
+{
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ unsigned int thoff, mtu;
+ struct ipv6hdr *ip6h;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+ mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
+ return 0;
+
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
+ thoff = sizeof(*ip6h) + ctx->offset;
+ if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
+ return 0;
+
+ if (!nf_flow_dst_check(&tuplehash->tuple)) {
+ flow_offload_teardown(flow);
+ return 0;
+ }
+
+ if (skb_try_make_writable(skb, thoff + ctx->hdrsize))
+ return -1;
+
+ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
+
+ ip6h = ipv6_hdr(skb);
+ nf_flow_nat_ipv6(flow, skb, dir, ip6h);
+
+ ip6h->hop_limit--;
+ skb_clear_tstamp(skb);
+
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
+ return 1;
+}
+
+static struct flow_offload_tuple_rhash *
+nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
+ struct nf_flowtable *flow_table,
+ struct sk_buff *skb)
+{
+ struct flow_offload_tuple tuple = {};
+
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+ return NULL;
+
+ if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
+ return NULL;
+
+ return flow_offload_lookup(flow_table, &tuple);
+}
+
unsigned int
nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct flow_offload_tuple_rhash *tuplehash;
struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
+ struct flow_offload_tuple *other_tuple;
enum flow_offload_tuple_dir dir;
+ struct nf_flowtable_ctx ctx = {
+ .in = state->in,
+ };
+ struct nf_flow_xmit xmit = {};
+ struct in6_addr *ip6_daddr;
struct flow_offload *flow;
- struct net_device *outdev;
- struct in6_addr *nexthop;
- struct ipv6hdr *ip6h;
+ struct neighbour *neigh;
struct rt6_info *rt;
+ int ret;
- if (skb->protocol != htons(ETH_P_IPV6))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
+ tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb);
if (tuplehash == NULL)
return NF_ACCEPT;
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
+ ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
+ if (ret < 0)
+ return NF_DROP;
+ else if (ret == 0)
return NF_ACCEPT;
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
+ memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+ IP6CB(skb)->iif = skb->dev->ifindex;
+ IP6CB(skb)->flags = IP6SKB_FORWARDED;
+ return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+ }
+
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ip6_daddr = &other_tuple->src_v6;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
-
- if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
- sizeof(*ip6h)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ if (nf_flow_encap_push(skb, other_tuple) < 0)
return NF_DROP;
- if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = dst_rt6_info(tuplehash->tuple.dst_cache);
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr));
+ if (IS_ERR(neigh)) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = neigh->ha;
+ skb_dst_set_noref(skb, &rt->dst);
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx);
+ if (!xmit.outdev) {
+ flow_offload_teardown(flow);
+ return NF_DROP;
+ }
+ xmit.dest = tuplehash->tuple.out.h_dest;
+ xmit.source = tuplehash->tuple.out.h_source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
return NF_DROP;
+ }
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- ip6h = ipv6_hdr(skb);
- ip6h->hop_limit--;
-
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-
- return NF_STOLEN;
+ return nf_flow_queue_xmit(state->net, skb, &xmit);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
new file mode 100644
index 000000000000..d8f7bfd60ac6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -0,0 +1,1241 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <linux/tc_act/tc_csum.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
+
+struct flow_offload_work {
+ struct list_head list;
+ enum flow_cls_command cmd;
+ struct nf_flowtable *flowtable;
+ struct flow_offload *flow;
+ struct work_struct work;
+};
+
+#define NF_FLOW_DISSECTOR(__match, __type, __field) \
+ (__match)->dissector.offset[__type] = \
+ offsetof(struct nf_flow_key, __field)
+
+static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nf_flow_key *mask = &match->mask;
+ struct nf_flow_key *key = &match->key;
+ unsigned long long enc_keys;
+
+ if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ return;
+
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
+ mask->enc_key_id.keyid = 0xffffffff;
+ enc_keys = BIT_ULL(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+
+ if (ip_tunnel_info_af(tun_info) == AF_INET) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+ enc_ipv4);
+ key->enc_ipv4.src = tun_info->key.u.ipv4.dst;
+ key->enc_ipv4.dst = tun_info->key.u.ipv4.src;
+ if (key->enc_ipv4.src)
+ mask->enc_ipv4.src = 0xffffffff;
+ if (key->enc_ipv4.dst)
+ mask->enc_ipv4.dst = 0xffffffff;
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else {
+ memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
+ sizeof(struct in6_addr));
+ memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ if (memcmp(&key->enc_ipv6.src, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&mask->enc_ipv6.src, 0xff,
+ sizeof(struct in6_addr));
+ if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&mask->enc_ipv6.dst, 0xff,
+ sizeof(struct in6_addr));
+ enc_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ match->dissector.used_keys |= enc_keys;
+}
+
+static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key,
+ struct flow_dissector_key_vlan *mask,
+ u16 vlan_id, __be16 proto)
+{
+ key->vlan_id = vlan_id;
+ mask->vlan_id = VLAN_VID_MASK;
+ key->vlan_tpid = proto;
+ mask->vlan_tpid = 0xffff;
+}
+
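+/* Translate a flow tuple into flow dissector match keys: meta, control
+ * and basic always, plus VLAN/CVLAN, addresses and ports as needed.
+ */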
+static int nf_flow_rule_match(struct nf_flow_match *match,
+ const struct flow_offload_tuple *tuple,
+ struct dst_entry *other_dst)
+{
+ struct nf_flow_key *mask = &match->mask;
+ struct nf_flow_key *key = &match->key;
+ struct ip_tunnel_info *tun_info;
+ bool vlan_encap = false;
+
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
+
+ if (other_dst && other_dst->lwtstate) {
+ tun_info = lwt_tun_info(other_dst->lwtstate);
+ nf_flow_rule_lwt_match(match, tun_info);
+ }
+
+ if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_TC)
+ key->meta.ingress_ifindex = tuple->tc.iifidx;
+ else
+ key->meta.ingress_ifindex = tuple->iifidx;
+
+ mask->meta.ingress_ifindex = 0xffffffff;
+
+ if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) &&
+ tuple->encap[0].proto == htons(ETH_P_8021Q)) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[0].id,
+ tuple->encap[0].proto);
+ vlan_encap = true;
+ }
+
+ if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) &&
+ tuple->encap[1].proto == htons(ETH_P_8021Q)) {
+ if (vlan_encap) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN,
+ cvlan);
+ nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ } else {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN,
+ vlan);
+ nf_flow_rule_vlan_match(&key->vlan, &mask->vlan,
+ tuple->encap[1].id,
+ tuple->encap[1].proto);
+ }
+ }
+
+ switch (tuple->l3proto) {
+ case AF_INET:
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ key->basic.n_proto = htons(ETH_P_IP);
+ key->ipv4.src = tuple->src_v4.s_addr;
+ mask->ipv4.src = 0xffffffff;
+ key->ipv4.dst = tuple->dst_v4.s_addr;
+ mask->ipv4.dst = 0xffffffff;
+ break;
+ case AF_INET6:
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ key->basic.n_proto = htons(ETH_P_IPV6);
+ key->ipv6.src = tuple->src_v6;
+ memset(&mask->ipv6.src, 0xff, sizeof(mask->ipv6.src));
+ key->ipv6.dst = tuple->dst_v6;
+ memset(&mask->ipv6.dst, 0xff, sizeof(mask->ipv6.dst));
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ mask->control.addr_type = 0xffff;
+ match->dissector.used_keys |= BIT_ULL(key->control.addr_type);
+ mask->basic.n_proto = 0xffff;
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ key->tcp.flags = 0;
+ mask->tcp.flags = cpu_to_be16(be32_to_cpu(TCP_FLAG_RST | TCP_FLAG_FIN) >> 16);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_TCP);
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_GRE:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ key->basic.ip_proto = tuple->l4proto;
+ mask->basic.ip_proto = 0xff;
+
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_META) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL) |
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
+
+ switch (tuple->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ key->tp.src = tuple->src_port;
+ mask->tp.src = 0xffff;
+ key->tp.dst = tuple->dst_port;
+ mask->tp.dst = 0xffff;
+
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+ break;
+ }
+
+ return 0;
+}
+
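+/* All header rewrites below are expressed as masked 32-bit writes via
+ * FLOW_ACTION_MANGLE entries.
+ */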
+static void flow_offload_mangle(struct flow_action_entry *entry,
+ enum flow_action_mangle_base htype, u32 offset,
+ const __be32 *value, const __be32 *mask)
+{
+ entry->id = FLOW_ACTION_MANGLE;
+ entry->mangle.htype = htype;
+ entry->mangle.offset = offset;
+ memcpy(&entry->mangle.mask, mask, sizeof(u32));
+ memcpy(&entry->mangle.val, value, sizeof(u32));
+}
+
+static inline struct flow_action_entry *
+flow_action_entry_next(struct nf_flow_rule *flow_rule)
+{
+ int i = flow_rule->rule->action.num_entries++;
+
+ return &flow_rule->rule->action.entries[i];
+}
+
+static int flow_offload_eth_src(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ struct net_device *dev = NULL;
+ const unsigned char *addr;
+ u32 mask, val;
+ u16 val16;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ addr = this_tuple->out.h_source;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ dev = dev_get_by_index(net, other_tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ addr = dev->dev_addr;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ mask = ~0xffff0000;
+ memcpy(&val16, addr, 2);
+ val = val16 << 16;
+ flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+ &val, &mask);
+
+ mask = ~0xffffffff;
+ memcpy(&val, addr + 2, 4);
+ flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
+ &val, &mask);
+
+ dev_put(dev);
+
+ return 0;
+}
+
+static int flow_offload_eth_dst(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ const struct dst_entry *dst_cache;
+ unsigned char ha[ETH_ALEN];
+ struct neighbour *n;
+ const void *daddr;
+ u32 mask, val;
+ u8 nud_state;
+ u16 val16;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ether_addr_copy(ha, this_tuple->out.h_dest);
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ daddr = &other_tuple->src_v4;
+ dst_cache = this_tuple->dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -ENOENT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ mask = ~0xffffffff;
+ memcpy(&val, ha, 4);
+ flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0,
+ &val, &mask);
+
+ mask = ~0x0000ffff;
+ memcpy(&val16, ha + 4, 2);
+ val = val16;
+ flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+ &val, &mask);
+
+ return 0;
+}
+
+static void flow_offload_ipv4_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask = ~htonl(0xffffffff);
+ __be32 addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ offset = offsetof(struct iphdr, saddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ offset = offsetof(struct iphdr, daddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+ &addr, &mask);
+}
+
+static void flow_offload_ipv4_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask = ~htonl(0xffffffff);
+ __be32 addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ offset = offsetof(struct iphdr, daddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ offset = offsetof(struct iphdr, saddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+ &addr, &mask);
+}
+
+static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
+ unsigned int offset,
+ const __be32 *addr, const __be32 *mask)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) {
+ entry = flow_action_entry_next(flow_rule);
+ flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+ offset + i * sizeof(u32), &addr[i], mask);
+ }
+}
+
+static void flow_offload_ipv6_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ u32 mask = ~htonl(0xffffffff);
+ const __be32 *addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, saddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, daddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+}
+
+static void flow_offload_ipv6_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ u32 mask = ~htonl(0xffffffff);
+ const __be32 *addr;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, daddr);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr32;
+ offset = offsetof(struct ipv6hdr, saddr);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_ipv6_mangle(flow_rule, offset, addr, &mask);
+}
+
+static int flow_offload_l4proto(const struct flow_offload *flow)
+{
+ u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+ u8 type = 0;
+
+ switch (protonum) {
+ case IPPROTO_TCP:
+ type = FLOW_ACT_MANGLE_HDR_TYPE_TCP;
+ break;
+ case IPPROTO_UDP:
+ type = FLOW_ACT_MANGLE_HDR_TYPE_UDP;
+ break;
+ default:
+ break;
+ }
+
+ return type;
+}
+
+static void flow_offload_port_snat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask, port;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port);
+ offset = 0; /* offsetof(struct tcphdr, source); */
+ port = htonl(port << 16);
+ mask = ~htonl(0xffff0000);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port);
+ offset = 0; /* offsetof(struct tcphdr, dest); */
+ port = htonl(port);
+ mask = ~htonl(0xffff);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+ &port, &mask);
+}
+
+static void flow_offload_port_dnat(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ u32 mask, port;
+ u32 offset;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port);
+ offset = 0; /* offsetof(struct tcphdr, dest); */
+ port = htonl(port);
+ mask = ~htonl(0xffff);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = ntohs(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port);
+ offset = 0; /* offsetof(struct tcphdr, source); */
+ port = htonl(port << 16);
+ mask = ~htonl(0xffff0000);
+ break;
+ default:
+ return;
+ }
+
+ flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+ &port, &mask);
+}
+
+static void flow_offload_ipv4_checksum(struct net *net,
+ const struct flow_offload *flow,
+ struct nf_flow_rule *flow_rule)
+{
+ u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+
+ entry->id = FLOW_ACTION_CSUM;
+ entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+
+ switch (protonum) {
+ case IPPROTO_TCP:
+ entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+ break;
+ case IPPROTO_UDP:
+ entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+ break;
+ }
+}
+
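+/* Emit the FLOW_ACTION_REDIRECT entry: DIRECT flows use the stored
+ * egress ifindex, NEIGH flows use the other direction's ingress device.
+ */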
+static void flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *this_tuple, *other_tuple;
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+ int ifindex;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ this_tuple = &flow->tuplehash[dir].tuple;
+ ifindex = this_tuple->out.ifidx;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ifindex = other_tuple->iifidx;
+ break;
+ default:
+ return;
+ }
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return;
+
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_REDIRECT;
+ entry->dev = dev;
+}
+
+static void flow_offload_encap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *this_tuple;
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+ if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = this_tuple->dst_cache;
+ if (dst && dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_ENCAP;
+ entry->tunnel = tun_info;
+ }
+ }
+}
+
+static void flow_offload_decap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *other_tuple;
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = other_tuple->dst_cache;
+ if (dst && dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_DECAP;
+ }
+ }
+}
+
+static int
+nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *other_tuple;
+ const struct flow_offload_tuple *tuple;
+ int i;
+
+ flow_offload_decap_tunnel(flow, dir, flow_rule);
+ flow_offload_encap_tunnel(flow, dir, flow_rule);
+
+ if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+ flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ return -1;
+
+ tuple = &flow->tuplehash[dir].tuple;
+
+ for (i = 0; i < tuple->encap_num; i++) {
+ struct flow_action_entry *entry;
+
+ if (tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ if (tuple->encap[i].proto == htons(ETH_P_8021Q)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_VLAN_POP;
+ }
+ }
+
+ other_tuple = &flow->tuplehash[!dir].tuple;
+
+ for (i = 0; i < other_tuple->encap_num; i++) {
+ struct flow_action_entry *entry;
+
+ if (other_tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ entry = flow_action_entry_next(flow_rule);
+
+ switch (other_tuple->encap[i].proto) {
+ case htons(ETH_P_PPP_SES):
+ entry->id = FLOW_ACTION_PPPOE_PUSH;
+ entry->pppoe.sid = other_tuple->encap[i].id;
+ break;
+ case htons(ETH_P_8021Q):
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = other_tuple->encap[i].id;
+ entry->vlan.proto = other_tuple->encap[i].proto;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
+ return -1;
+
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ flow_offload_ipv4_snat(net, flow, dir, flow_rule);
+ flow_offload_port_snat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ flow_offload_ipv4_dnat(net, flow, dir, flow_rule);
+ flow_offload_port_dnat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_SNAT, &flow->flags) ||
+ test_bit(NF_FLOW_DNAT, &flow->flags))
+ flow_offload_ipv4_checksum(net, flow, flow_rule);
+
+ flow_offload_redirect(net, flow, dir, flow_rule);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
+
+int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
+ return -1;
+
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ flow_offload_ipv6_snat(net, flow, dir, flow_rule);
+ flow_offload_port_snat(net, flow, dir, flow_rule);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
+ flow_offload_port_dnat(net, flow, dir, flow_rule);
+ }
+
+ flow_offload_redirect(net, flow, dir, flow_rule);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6);
+
+#define NF_FLOW_RULE_ACTION_MAX 16
+
+static struct nf_flow_rule *
+nf_flow_offload_rule_alloc(struct net *net,
+ const struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir)
+{
+ const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload_tuple *tuple, *other_tuple;
+ struct flow_offload *flow = offload->flow;
+ struct dst_entry *other_dst = NULL;
+ struct nf_flow_rule *flow_rule;
+ int err = -ENOMEM;
+
+ flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
+ if (!flow_rule)
+ goto err_flow;
+
+ flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX);
+ if (!flow_rule->rule)
+ goto err_flow_rule;
+
+ flow_rule->rule->match.dissector = &flow_rule->match.dissector;
+ flow_rule->rule->match.mask = &flow_rule->match.mask;
+ flow_rule->rule->match.key = &flow_rule->match.key;
+
+ tuple = &flow->tuplehash[dir].tuple;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ other_dst = other_tuple->dst_cache;
+
+ err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
+ if (err < 0)
+ goto err_flow_match;
+
+ flow_rule->rule->action.num_entries = 0;
+ if (flowtable->type->action(net, flow, dir, flow_rule) < 0)
+ goto err_flow_match;
+
+ return flow_rule;
+
+err_flow_match:
+ kfree(flow_rule->rule);
+err_flow_rule:
+ kfree(flow_rule);
+err_flow:
+ return NULL;
+}
+
+static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ for (i = 0; i < flow_rule->rule->action.num_entries; i++) {
+ entry = &flow_rule->rule->action.entries[i];
+ if (entry->id != FLOW_ACTION_REDIRECT)
+ continue;
+
+ dev_put(entry->dev);
+ }
+ kfree(flow_rule->rule);
+ kfree(flow_rule);
+}
+
+static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[])
+{
+ int i;
+
+ for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++)
+ __nf_flow_offload_destroy(flow_rule[i]);
+}
+
+static int nf_flow_offload_alloc(const struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule[])
+{
+ struct net *net = read_pnet(&offload->flowtable->net);
+
+ flow_rule[0] = nf_flow_offload_rule_alloc(net, offload,
+ FLOW_OFFLOAD_DIR_ORIGINAL);
+ if (!flow_rule[0])
+ return -ENOMEM;
+
+ flow_rule[1] = nf_flow_offload_rule_alloc(net, offload,
+ FLOW_OFFLOAD_DIR_REPLY);
+ if (!flow_rule[1]) {
+ __nf_flow_offload_destroy(flow_rule[0]);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void nf_flow_offload_init(struct flow_cls_offload *cls_flow,
+ __be16 proto, int priority,
+ enum flow_cls_command cmd,
+ const struct flow_offload_tuple *tuple,
+ struct netlink_ext_ack *extack)
+{
+ cls_flow->common.protocol = proto;
+ cls_flow->common.prio = priority;
+ cls_flow->common.extack = extack;
+ cls_flow->command = cmd;
+ cls_flow->cookie = (unsigned long)tuple;
+}
+
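+/* Run one flow_cls_offload command against each registered block
+ * callback, returning how many callbacks accepted it.
+ */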
+static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
+ struct flow_offload *flow,
+ struct nf_flow_rule *flow_rule,
+ enum flow_offload_tuple_dir dir,
+ int priority, int cmd,
+ struct flow_stats *stats,
+ struct list_head *block_cb_list)
+{
+ struct flow_cls_offload cls_flow = {};
+ struct netlink_ext_ack extack = {};
+ struct flow_block_cb *block_cb;
+ __be16 proto = ETH_P_ALL;
+ int err, i = 0;
+
+ nf_flow_offload_init(&cls_flow, proto, priority, cmd,
+ &flow->tuplehash[dir].tuple, &extack);
+ if (cmd == FLOW_CLS_REPLACE)
+ cls_flow.rule = flow_rule->rule;
+
+ down_read(&flowtable->flow_block_lock);
+ list_for_each_entry(block_cb, block_cb_list, list) {
+ err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
+ block_cb->cb_priv);
+ if (err < 0)
+ continue;
+
+ i++;
+ }
+ up_read(&flowtable->flow_block_lock);
+
+ if (cmd == FLOW_CLS_STATS)
+ memcpy(stats, &cls_flow.stats, sizeof(*stats));
+
+ return i;
+}
+
+static int flow_offload_tuple_add(struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule,
+ enum flow_offload_tuple_dir dir)
+{
+ return nf_flow_offload_tuple(offload->flowtable, offload->flow,
+ flow_rule, dir,
+ offload->flowtable->priority,
+ FLOW_CLS_REPLACE, NULL,
+ &offload->flowtable->flow_block.cb_list);
+}
+
+static void flow_offload_tuple_del(struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir)
+{
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->flowtable->priority,
+ FLOW_CLS_DESTROY, NULL,
+ &offload->flowtable->flow_block.cb_list);
+}
+
+static int flow_offload_rule_add(struct flow_offload_work *offload,
+ struct nf_flow_rule *flow_rule[])
+{
+ int ok_count = 0;
+
+ ok_count += flow_offload_tuple_add(offload, flow_rule[0],
+ FLOW_OFFLOAD_DIR_ORIGINAL);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+ FLOW_OFFLOAD_DIR_REPLY);
+ if (ok_count == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+static void flow_offload_work_add(struct flow_offload_work *offload)
+{
+ struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX];
+ int err;
+
+ err = nf_flow_offload_alloc(offload, flow_rule);
+ if (err < 0)
+ return;
+
+ err = flow_offload_rule_add(offload, flow_rule);
+ if (err < 0)
+ goto out;
+
+ set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+
+out:
+ nf_flow_offload_destroy(flow_rule);
+}
+
+static void flow_offload_work_del(struct flow_offload_work *offload)
+{
+ clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+ set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
+}
+
+static void flow_offload_tuple_stats(struct flow_offload_work *offload,
+ enum flow_offload_tuple_dir dir,
+ struct flow_stats *stats)
+{
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->flowtable->priority,
+ FLOW_CLS_STATS, stats,
+ &offload->flowtable->flow_block.cb_list);
+}
+
+static void flow_offload_work_stats(struct flow_offload_work *offload)
+{
+ struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {};
+ u64 lastused;
+
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
+ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY,
+ &stats[1]);
+
+ lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
+ offload->flow->timeout = max_t(u64, offload->flow->timeout,
+ lastused + flow_offload_get_timeout(offload->flow));
+
+ if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
+ if (stats[0].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ stats[0].pkts, stats[0].bytes);
+ if (stats[1].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_REPLY,
+ stats[1].pkts, stats[1].bytes);
+ }
+}
+
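+/* Workqueue handler: each add/del/stats request is a work item queued
+ * on the per-command workqueues declared above.
+ */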
+static void flow_offload_work_handler(struct work_struct *work)
+{
+ struct flow_offload_work *offload;
+ struct net *net;
+
+ offload = container_of(work, struct flow_offload_work, work);
+ net = read_pnet(&offload->flowtable->net);
+ switch (offload->cmd) {
+ case FLOW_CLS_REPLACE:
+ flow_offload_work_add(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_add);
+ break;
+ case FLOW_CLS_DESTROY:
+ flow_offload_work_del(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_del);
+ break;
+ case FLOW_CLS_STATS:
+ flow_offload_work_stats(offload);
+ NF_FLOW_TABLE_STAT_DEC_ATOMIC(net, count_wq_stats);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags);
+ kfree(offload);
+}
+
+static void flow_offload_queue_work(struct flow_offload_work *offload)
+{
+ struct net *net = read_pnet(&offload->flowtable->net);
+
+ if (offload->cmd == FLOW_CLS_REPLACE) {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_add);
+ queue_work(nf_flow_offload_add_wq, &offload->work);
+ } else if (offload->cmd == FLOW_CLS_DESTROY) {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_del);
+ queue_work(nf_flow_offload_del_wq, &offload->work);
+ } else {
+ NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count_wq_stats);
+ queue_work(nf_flow_offload_stats_wq, &offload->work);
+ }
+}
+
+static struct flow_offload_work *
+nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
+ struct flow_offload *flow, unsigned int cmd)
+{
+ struct flow_offload_work *offload;
+
+ if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags))
+ return NULL;
+
+ offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+ if (!offload) {
+ clear_bit(NF_FLOW_HW_PENDING, &flow->flags);
+ return NULL;
+ }
+
+ offload->cmd = cmd;
+ offload->flow = flow;
+ offload->flowtable = flowtable;
+ INIT_WORK(&offload->work, flow_offload_work_handler);
+
+ return offload;
+}
+
+void nf_flow_offload_add(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE);
+ if (!offload)
+ return;
+
+ flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_del(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY);
+ if (!offload)
+ return;
+
+ set_bit(NF_FLOW_HW_DYING, &flow->flags);
+ flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_stats(struct nf_flowtable *flowtable,
+ struct flow_offload *flow)
+{
+ struct flow_offload_work *offload;
+ __s32 delta;
+
+ delta = nf_flow_timeout_delta(flow->timeout);
+ if (delta >= (9 * flow_offload_get_timeout(flow)) / 10)
+ return;
+
+ offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);
+ if (!offload)
+ return;
+
+ flow_offload_queue_work(offload);
+}
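
The early return above rate-limits hardware stats polling: a refresh is queued only once less than 90% of the flow's timeout remains. A hedged sketch of the predicate using kernel-style types; with the default 30-second flowtable timeout this allows at most roughly one stats request per flow every 3 seconds:

/* remaining/timeout are in jiffies, mirroring nf_flow_timeout_delta()
 * and flow_offload_get_timeout() above.
 */
static bool stats_refresh_due(s32 remaining, s32 timeout)
{
	return remaining < (9 * timeout) / 10;
}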
+
+void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable)
+{
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_del_wq);
+ nf_flow_table_gc_run(flowtable);
+ }
+}
+
+void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
+{
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_add_wq);
+ flush_workqueue(nf_flow_offload_del_wq);
+ flush_workqueue(nf_flow_offload_stats_wq);
+ }
+}
+
+static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
+ struct flow_block_offload *bo,
+ enum flow_block_command cmd)
+{
+ struct flow_block_cb *block_cb, *next;
+ int err = 0;
+
+ down_write(&flowtable->flow_block_lock);
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ list_splice(&bo->cb_list, &flowtable->flow_block.cb_list);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+ up_write(&flowtable->flow_block_lock);
+
+ return err;
+}
+
+static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nf_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
+{
+ memset(bo, 0, sizeof(*bo));
+ bo->net = net;
+ bo->block = &flowtable->flow_block;
+ bo->command = cmd;
+ bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ bo->extack = extack;
+ bo->cb_list_head = &flowtable->flow_block.cb_list;
+ INIT_LIST_HEAD(&bo->cb_list);
+}
+
+static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb)
+{
+ struct nf_flowtable *flowtable = block_cb->indr.data;
+ struct net_device *dev = block_cb->indr.dev;
+
+ nf_flow_table_gc_cleanup(flowtable, dev);
+ down_write(&flowtable->flow_block_lock);
+ list_del(&block_cb->list);
+ list_del(&block_cb->driver_list);
+ flow_block_cb_free(block_cb);
+ up_write(&flowtable->flow_block_lock);
+}
+
+static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
+
+ return flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_FT, flowtable, bo,
+ nf_flow_table_indr_cleanup);
+}
+
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
+ down_write(&flowtable->flow_block_lock);
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
+ up_write(&flowtable->flow_block_lock);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+ int err;
+
+ if (!nf_flowtable_hw_offload(flowtable))
+ return nf_flow_offload_xdp_setup(flowtable, dev, cmd);
+
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
+ else
+ err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
+ if (err < 0)
+ return err;
+
+ return nf_flow_table_block_setup(flowtable, &bo, cmd);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
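
nf_flow_table_offload_setup() prefers a direct ndo_setup_tc() callback and falls back to the indirect block infrastructure for devices, such as tunnels, that do not expose one. A hedged sketch of the driver side, using the generic flow_block_cb_setup_simple() helper (driver names hypothetical):

static LIST_HEAD(foo_block_cb_list);

static int foo_setup_ft_cb(enum tc_setup_type type, void *type_data,
			   void *cb_priv)
{
	/* translate FLOW_CLS_REPLACE/DESTROY/STATS into hardware ops */
	return -EOPNOTSUPP;
}

static int foo_setup_tc(struct net_device *dev, enum tc_setup_type type,
			void *type_data)
{
	switch (type) {
	case TC_SETUP_FT:
		return flow_block_cb_setup_simple(type_data,
						  &foo_block_cb_list,
						  foo_setup_ft_cb,
						  dev, dev, true);
	default:
		return -EOPNOTSUPP;
	}
}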
+
+int nf_flow_table_offload_init(void)
+{
+ nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_add_wq)
+ return -ENOMEM;
+
+ nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_del_wq)
+ goto err_del_wq;
+
+ nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_stats_wq)
+ goto err_stats_wq;
+
+ return 0;
+
+err_stats_wq:
+ destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+ destroy_workqueue(nf_flow_offload_add_wq);
+ return -ENOMEM;
+}
+
+void nf_flow_table_offload_exit(void)
+{
+ destroy_workqueue(nf_flow_offload_add_wq);
+ destroy_workqueue(nf_flow_offload_del_wq);
+ destroy_workqueue(nf_flow_offload_stats_wq);
+}
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
new file mode 100644
index 000000000000..f0984cf69a09
--- /dev/null
+++ b/net/netfilter/nf_flow_table_path.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h>
+#include <net/inet_dscp.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nft_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir)
+{
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+}
+
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nft_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+struct nft_forward_info {
+ const struct net_device *indev;
+ const struct net_device *outdev;
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
+ struct flow_offload_tunnel tun;
+ u8 num_tuns;
+ u8 ingress_vlans;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
+};
+
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+ struct nft_forward_info *info,
+ unsigned char *ha, struct nf_flowtable *flowtable)
+{
+ const struct net_device_path *path;
+ int i;
+
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
+ case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
+ case DEV_PATH_TUN:
+ info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths;
+ break;
+ }
+
+ /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
+ if (path->type == DEV_PATH_TUN) {
+ if (info->num_tuns) {
+ info->indev = NULL;
+ break;
+ }
+ info->tun.src_v6 = path->tun.src_v6;
+ info->tun.dst_v6 = path->tun.dst_v6;
+ info->tun.l3_proto = path->tun.l3_proto;
+ info->num_tuns++;
+ } else {
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id =
+ path->encap.id;
+ info->encap[info->num_encaps].proto =
+ path->encap.proto;
+ info->num_encaps++;
+ }
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ break;
+ case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1);
+ break;
+ case DEV_PATH_BR_VLAN_TAG:
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ if (WARN_ON_ONCE(!info->num_encaps)) {
+ info->indev = NULL;
+ break;
+ }
+ info->num_encaps--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ break;
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+ info->outdev = info->indev;
+
+ if (nf_flowtable_hw_offload(flowtable) &&
+ nft_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
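
The walk above flattens the device path reported by dev_fill_forward_path() into nft_forward_info. A hedged worked example, assuming ingress through a hypothetical VLAN 100 upper device stacked on eth0:

/* stack->path[0] = { .type = DEV_PATH_VLAN, .encap = { 100, ETH_P_8021Q } }
 * stack->path[1] = { .type = DEV_PATH_ETHERNET, .dev = eth0 }
 *
 * nft_dev_path_info() then yields:
 *
 *   info->indev      = eth0   (the real ingress device)
 *   info->encap[0]   = { .id = 100, .proto = htons(ETH_P_8021Q) }
 *   info->num_encaps = 1
 */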
+
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+ struct nft_flowtable *ft)
+{
+ struct nft_hook *hook;
+ bool found = false;
+
+ list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+ if (!nft_hook_find_ops_rcu(hook, dev))
+ continue;
+
+ found = true;
+ break;
+ }
+
+ return found;
+}
+
+static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
+ struct flow_offload_tunnel *tun,
+ struct nf_flow_route *route,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *cur_dst = route->tuple[dir].dst;
+ struct dst_entry *tun_dst = NULL;
+ struct flowi fl = {};
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = tun->dst_v4.s_addr;
+ fl.u.ip4.saddr = tun->src_v4.s_addr;
+ fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = tun->dst_v6;
+ fl.u.ip6.saddr = tun->src_v6;
+ fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt));
+ if (!tun_dst)
+ return -ENOENT;
+
+ route->tuple[dir].dst = tun_dst;
+ dst_release(cur_dst);
+
+ return 0;
+}
+
+static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
+ struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nft_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nft_dev_path_info(&stack, &info, ha, &ft->data);
+
+ if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+
+ if (info.num_tuns &&
+ !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) {
+ route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
+ route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
+ route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
+ route->tuple[!dir].in.num_tuns = info.num_tuns;
+ }
+
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ struct dst_entry *this_dst = skb_dst(pkt->skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+ fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
+ fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
+ fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
+ fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
+ break;
+ }
+
+ if (!dst_hold_safe(this_dst))
+ return -ENOENT;
+
+ nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+ if (!other_dst) {
+ dst_release(this_dst);
+ return -ENOENT;
+ }
+
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, dir, ft);
+ if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ nft_dev_forward_path(pkt, route, ct, !dir, ft);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_flow_route);
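
Note the deliberate direction flip in the flowi above: the lookup destination for direction dir is the source of that direction's tuple, so the dst_entry returned by nf_route() is the one installed for transmitting in the opposite direction. A hedged worked example:

/* Hypothetical original-direction flow 10.0.0.2:5000 -> 8.8.8.8:53.
 * For dir == IP_CT_DIR_ORIGINAL the lookup key becomes:
 *
 *   daddr = 10.0.0.2   (tuplehash[ORIGINAL].tuple.src)
 *   saddr = 8.8.8.8    (tuplehash[REPLY].tuple.src)
 *
 * i.e. the route back towards the client, which is then installed via
 * nft_default_forward_path(route, other_dst, !dir) for the reply path.
 */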
diff --git a/net/netfilter/nf_flow_table_procfs.c b/net/netfilter/nf_flow_table_procfs.c
new file mode 100644
index 000000000000..159b033a43e6
--- /dev/null
+++ b/net/netfilter/nf_flow_table_procfs.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <net/netfilter/nf_flow_table.h>
+
+static void *nf_flow_table_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
+ for (cpu = *pos - 1; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+
+ return NULL;
+}
+
+static void *nf_flow_table_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ int cpu;
+
+ for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
+ if (!cpu_possible(cpu))
+ continue;
+ *pos = cpu + 1;
+ return per_cpu_ptr(net->ft.stat, cpu);
+ }
+ (*pos)++;
+ return NULL;
+}
+
+static void nf_flow_table_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int nf_flow_table_cpu_seq_show(struct seq_file *seq, void *v)
+{
+ const struct nf_flow_table_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "wq_add wq_del wq_stats\n");
+ return 0;
+ }
+
+ seq_printf(seq, "%8d %8d %8d\n",
+ st->count_wq_add,
+ st->count_wq_del,
+ st->count_wq_stats
+ );
+ return 0;
+}
+
+static const struct seq_operations nf_flow_table_cpu_seq_ops = {
+ .start = nf_flow_table_cpu_seq_start,
+ .next = nf_flow_table_cpu_seq_next,
+ .stop = nf_flow_table_cpu_seq_stop,
+ .show = nf_flow_table_cpu_seq_show,
+};
+
+int nf_flow_table_init_proc(struct net *net)
+{
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_net("nf_flowtable", 0444, net->proc_net_stat,
+ &nf_flow_table_cpu_seq_ops,
+ sizeof(struct seq_net_private));
+ return pde ? 0 : -ENOMEM;
+}
+
+void nf_flow_table_fini_proc(struct net *net)
+{
+ remove_proc_entry("nf_flowtable", net->proc_net_stat);
+}
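
With this file in place the counters surface as /proc/net/stat/nf_flowtable, one row per possible CPU after the header. A minimal userspace sketch that dumps them (error handling trimmed):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/stat/nf_flowtable", "r");
	char line[256];

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "wq_add wq_del wq_stats" + rows */
	fclose(f);
	return 0;
}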
diff --git a/net/netfilter/nf_flow_table_xdp.c b/net/netfilter/nf_flow_table_xdp.c
new file mode 100644
index 000000000000..e1252d042699
--- /dev/null
+++ b/net/netfilter/nf_flow_table_xdp.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct flow_offload_xdp_ft {
+ struct list_head head;
+ struct nf_flowtable *ft;
+ struct rcu_head rcuhead;
+};
+
+struct flow_offload_xdp {
+ struct hlist_node hnode;
+ unsigned long net_device_addr;
+ struct list_head head;
+};
+
+#define NF_XDP_HT_BITS 4
+static DEFINE_HASHTABLE(nf_xdp_hashtable, NF_XDP_HT_BITS);
+static DEFINE_MUTEX(nf_xdp_hashtable_lock);
+
+/* caller must hold rcu read lock */
+struct nf_flowtable *nf_flowtable_by_dev(const struct net_device *dev)
+{
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp *iter;
+
+ hash_for_each_possible_rcu(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ struct flow_offload_xdp_ft *ft_elem;
+
+ /* A given net_device is expected to be inserted
+ * into a single nf_flowtable only, so we always
+ * return the first element here.
+ */
+ ft_elem = list_first_or_null_rcu(&iter->head,
+ struct flow_offload_xdp_ft,
+ head);
+ return ft_elem ? ft_elem->ft : NULL;
+ }
+ }
+
+ return NULL;
+}
+
+static int nf_flowtable_by_dev_insert(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+ struct flow_offload_xdp_ft *ft_elem;
+
+ ft_elem = kzalloc(sizeof(*ft_elem), GFP_KERNEL_ACCOUNT);
+ if (!ft_elem)
+ return -ENOMEM;
+
+ ft_elem->ft = ft;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (!elem) {
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL_ACCOUNT);
+ if (!elem)
+ goto err_unlock;
+
+ elem->net_device_addr = key;
+ INIT_LIST_HEAD(&elem->head);
+ hash_add_rcu(nf_xdp_hashtable, &elem->hnode, key);
+ }
+ list_add_tail_rcu(&ft_elem->head, &elem->head);
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ return 0;
+
+err_unlock:
+ mutex_unlock(&nf_xdp_hashtable_lock);
+ kfree(ft_elem);
+
+ return -ENOMEM;
+}
+
+static void nf_flowtable_by_dev_remove(struct nf_flowtable *ft,
+ const struct net_device *dev)
+{
+ struct flow_offload_xdp *iter, *elem = NULL;
+ unsigned long key = (unsigned long)dev;
+
+ mutex_lock(&nf_xdp_hashtable_lock);
+
+ hash_for_each_possible(nf_xdp_hashtable, iter, hnode, key) {
+ if (key == iter->net_device_addr) {
+ elem = iter;
+ break;
+ }
+ }
+
+ if (elem) {
+ struct flow_offload_xdp_ft *ft_elem, *ft_next;
+
+ list_for_each_entry_safe(ft_elem, ft_next, &elem->head, head) {
+ if (ft_elem->ft == ft) {
+ list_del_rcu(&ft_elem->head);
+ kfree_rcu(ft_elem, rcuhead);
+ }
+ }
+
+ if (list_empty(&elem->head))
+ hash_del_rcu(&elem->hnode);
+ else
+ elem = NULL;
+ }
+
+ mutex_unlock(&nf_xdp_hashtable_lock);
+
+ if (elem) {
+ synchronize_rcu();
+ kfree(elem);
+ }
+}
+
+int nf_flow_offload_xdp_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ return nf_flowtable_by_dev_insert(flowtable, dev);
+ case FLOW_BLOCK_UNBIND:
+ nf_flowtable_by_dev_remove(flowtable, dev);
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
+ return 0;
+}
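
As the comment on nf_flowtable_by_dev() notes, lookups must run under the RCU read lock, while insert/remove serialize on nf_xdp_hashtable_lock and defer frees to a grace period. A hedged sketch of a well-formed reader (function name hypothetical):

/* Returns whether an XDP flowtable is bound to @dev; the table pointer
 * itself must only be dereferenced inside the read-side section.
 */
static bool dev_has_xdp_flowtable(const struct net_device *dev)
{
	bool bound;

	rcu_read_lock();
	bound = nf_flowtable_by_dev(dev) != NULL;
	rcu_read_unlock();

	return bound;
}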
diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c
new file mode 100644
index 000000000000..2d890dd04ff8
--- /dev/null
+++ b/net/netfilter/nf_hooks_lwtunnel.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sysctl.h>
+#include <net/lwtunnel.h>
+#include <net/netfilter/nf_hooks_lwtunnel.h>
+#include <linux/netfilter.h>
+
+#include "nf_internals.h"
+
+static inline int nf_hooks_lwtunnel_get(void)
+{
+ return static_branch_unlikely(&nf_hooks_lwtunnel_enabled) ? 1 : 0;
+}
+
+static inline int nf_hooks_lwtunnel_set(int enable)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) {
+ if (!enable)
+ return -EBUSY;
+ } else if (enable) {
+ static_branch_enable(&nf_hooks_lwtunnel_enabled);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+int nf_hooks_lwtunnel_sysctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int proc_nf_hooks_lwtunnel_enabled = 0;
+ struct ctl_table tmp = {
+ .procname = table->procname,
+ .data = &proc_nf_hooks_lwtunnel_enabled,
+ .maxlen = sizeof(int),
+ .mode = table->mode,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ };
+ int ret;
+
+ if (!write)
+ proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get();
+
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0)
+ ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler);
+
+static struct ctl_table nf_lwtunnel_sysctl_table[] = {
+ {
+ .procname = "nf_hooks_lwtunnel",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = nf_hooks_lwtunnel_sysctl_handler,
+ },
+};
+
+static int __net_init nf_lwtunnel_net_init(struct net *net)
+{
+ struct ctl_table_header *hdr;
+ struct ctl_table *table;
+
+ table = nf_lwtunnel_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_lwtunnel_sysctl_table,
+ sizeof(nf_lwtunnel_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ }
+
+ hdr = register_net_sysctl_sz(net, "net/netfilter", table,
+ ARRAY_SIZE(nf_lwtunnel_sysctl_table));
+ if (!hdr)
+ goto err_reg;
+
+ net->nf.nf_lwtnl_dir_header = hdr;
+
+ return 0;
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void __net_exit nf_lwtunnel_net_exit(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->nf.nf_lwtnl_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_lwtnl_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static struct pernet_operations nf_lwtunnel_net_ops = {
+ .init = nf_lwtunnel_net_init,
+ .exit = nf_lwtunnel_net_exit,
+};
+
+int __init netfilter_lwtunnel_init(void)
+{
+ return register_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+
+void netfilter_lwtunnel_fini(void)
+{
+ unregister_pernet_subsys(&nf_lwtunnel_net_ops);
+}
+#else
+int __init netfilter_lwtunnel_init(void) { return 0; }
+void netfilter_lwtunnel_fini(void) {}
+#endif /* CONFIG_SYSCTL */
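
The knob registered above surfaces as /proc/sys/net/netfilter/nf_hooks_lwtunnel per network namespace; note that nf_hooks_lwtunnel_set() refuses to clear it again (-EBUSY) once the static branch is enabled. A minimal userspace sketch:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/netfilter/nf_hooks_lwtunnel", "w");

	if (!f)
		return 1;
	fputs("1\n", f);	/* enable; writing 0 afterwards fails with EBUSY */
	fclose(f);
	return 0;
}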
diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h
index e15779fd58e3..25403023060b 100644
--- a/net/netfilter/nf_internals.h
+++ b/net/netfilter/nf_internals.h
@@ -6,15 +6,35 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+/* nf_conntrack_netlink.c: applied on tuple filters */
+#define CTA_FILTER_F_CTA_IP_SRC (1 << 0)
+#define CTA_FILTER_F_CTA_IP_DST (1 << 1)
+#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2)
+#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3)
+#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4)
+#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7)
+#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10)
+#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11)
+#define CTA_FILTER_F_MAX (1 << 12)
+#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1)
+#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr
+
/* nf_queue.c */
-int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
- const struct nf_hook_entries *entries, unsigned int index,
- unsigned int verdict);
void nf_queue_nf_hook_drop(struct net *net);
/* nf_log.c */
int __init netfilter_log_init(void);
+#ifdef CONFIG_LWTUNNEL
+/* nf_hooks_lwtunnel.c */
+int __init netfilter_lwtunnel_init(void);
+void netfilter_lwtunnel_fini(void);
+#endif
+
/* core.c */
void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp,
const struct nf_hook_ops *reg);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index a61d6df6e5f6..74cef8bf554c 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -30,10 +31,10 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
int i;
for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
- if (loggers[pf][i] == NULL)
+ log = nft_log_dereference(loggers[pf][i]);
+ if (!log)
continue;
- log = nft_log_dereference(loggers[pf][i]);
if (!strncasecmp(str_logger, log->name, strlen(log->name)))
return log;
}
@@ -124,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger)
}
EXPORT_SYMBOL(nf_log_unregister);
+/**
+ * nf_log_is_registered - check if any logger is registered for a given protocol family
+ *
+ * @pf: Protocol family
+ *
+ * Returns: true if at least one logger is active for @pf, false otherwise.
+ */
+bool nf_log_is_registered(u_int8_t pf)
+{
+ int i;
+
+ if (pf >= NFPROTO_NUMPROTO) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+ if (rcu_access_pointer(loggers[pf][i]))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(nf_log_is_registered);
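
A typical use of the new helper is to skip the cost of rendering a log line when no backend could consume it. Hedged caller sketch (function name hypothetical):

static void maybe_log(struct net *net, u8 pf, unsigned int hooknum,
		      const struct sk_buff *skb)
{
	if (!nf_log_is_registered(pf))
		return;		/* no logger bound for this family */

	nf_log_packet(net, pf, hooknum, skb, NULL, NULL, NULL,
		      "sketch: ");
}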
+
int nf_log_bind_pf(struct net *net, u_int8_t pf,
const struct nf_logger *logger)
{
@@ -150,18 +177,16 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf)
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_logger_request_module(int pf, enum nf_log_type type)
-{
- if (loggers[pf][type] == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-}
-EXPORT_SYMBOL_GPL(nf_logger_request_module);
-
int nf_logger_find_get(int pf, enum nf_log_type type)
{
struct nf_logger *logger;
int ret = -ENOENT;
+ if (pf >= ARRAY_SIZE(loggers))
+ return -EINVAL;
+ if (type >= NF_LOG_TYPE_MAX)
+ return -EINVAL;
+
if (pf == NFPROTO_INET) {
ret = nf_logger_find_get(NFPROTO_IPV4, type);
if (ret < 0)
@@ -176,9 +201,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type)
return 0;
}
- if (rcu_access_pointer(loggers[pf][type]) == NULL)
- request_module("nf-logger-%u-%u", pf, type);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
if (logger == NULL)
@@ -202,11 +224,12 @@ void nf_logger_put(int pf, enum nf_log_type type)
return;
}
- BUG_ON(loggers[pf][type] == NULL);
-
rcu_read_lock();
logger = rcu_dereference(loggers[pf][type]);
- module_put(logger->me);
+ if (!logger)
+ WARN_ON_ONCE(1);
+ else
+ module_put(logger->me);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_logger_put);
@@ -373,7 +396,7 @@ static int seq_show(struct seq_file *s, void *v)
continue;
logger = nft_log_dereference(loggers[*pos][i]);
- seq_printf(s, "%s", logger->name);
+ seq_puts(s, logger->name);
if (i == 0 && loggers[*pos][i + 1] != NULL)
seq_puts(s, ",");
@@ -398,7 +421,7 @@ static const struct seq_operations nflog_seq_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
-static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
+static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO];
static struct ctl_table_header *nf_log_sysctl_fhdr;
static struct ctl_table nf_log_sysctl_ftable[] = {
@@ -409,11 +432,10 @@ static struct ctl_table nf_log_sysctl_ftable[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
-static int nf_log_proc_dostring(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int nf_log_proc_dostring(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
const struct nf_logger *logger;
char buf[NFLOGGER_NAME_LEN];
@@ -452,9 +474,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write,
mutex_lock(&nf_log_mutex);
logger = nft_log_dereference(net->nf.nf_loggers[tindex]);
if (!logger)
- strlcpy(buf, "NONE", sizeof(buf));
+ strscpy(buf, "NONE", sizeof(buf));
else
- strlcpy(buf, logger->name, sizeof(buf));
+ strscpy(buf, logger->name, sizeof(buf));
mutex_unlock(&nf_log_mutex);
r = proc_dostring(&tmp, write, buffer, lenp, ppos);
}
@@ -496,9 +518,10 @@ static int netfilter_log_sysctl_init(struct net *net)
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
table[i].extra2 = net;
- net->nf.nf_log_dir_header = register_net_sysctl(net,
- "net/netfilter/nf_log",
- table);
+ net->nf.nf_log_dir_header = register_net_sysctl_sz(net,
+ "net/netfilter/nf_log",
+ table,
+ ARRAY_SIZE(nf_log_sysctl_table));
if (!net->nf.nf_log_dir_header)
goto err_reg;
@@ -516,7 +539,7 @@ err_alloc:
static void netfilter_log_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->nf.nf_log_dir_header->ctl_table_arg;
unregister_net_sysctl_table(net->nf.nf_log_dir_header);
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
deleted file mode 100644
index 3a0d6880b7c9..000000000000
--- a/net/netfilter/nf_log_common.c
+++ /dev/null
@@ -1,215 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset)
-{
- struct udphdr _udph;
- const struct udphdr *uh;
-
- if (proto == IPPROTO_UDP)
- /* Max length: 10 "PROTO=UDP " */
- nf_log_buf_add(m, "PROTO=UDP ");
- else /* Max length: 14 "PROTO=UDPLITE " */
- nf_log_buf_add(m, "PROTO=UDPLITE ");
-
- if (fragment)
- goto out;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
- if (uh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
-
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
- ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
-
-out:
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_udp_header);
-
-int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
- u8 proto, int fragment, unsigned int offset,
- unsigned int logflags)
-{
- struct tcphdr _tcph;
- const struct tcphdr *th;
-
- /* Max length: 10 "PROTO=TCP " */
- nf_log_buf_add(m, "PROTO=TCP ");
-
- if (fragment)
- return 0;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
- if (th == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
- return 1;
- }
-
- /* Max length: 20 "SPT=65535 DPT=65535 " */
- nf_log_buf_add(m, "SPT=%u DPT=%u ",
- ntohs(th->source), ntohs(th->dest));
- /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
- if (logflags & NF_LOG_TCPSEQ) {
- nf_log_buf_add(m, "SEQ=%u ACK=%u ",
- ntohl(th->seq), ntohl(th->ack_seq));
- }
-
- /* Max length: 13 "WINDOW=65535 " */
- nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
- /* Max length: 9 "RES=0x3C " */
- nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
- TCP_RESERVED_BITS) >> 22));
- /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
- if (th->cwr)
- nf_log_buf_add(m, "CWR ");
- if (th->ece)
- nf_log_buf_add(m, "ECE ");
- if (th->urg)
- nf_log_buf_add(m, "URG ");
- if (th->ack)
- nf_log_buf_add(m, "ACK ");
- if (th->psh)
- nf_log_buf_add(m, "PSH ");
- if (th->rst)
- nf_log_buf_add(m, "RST ");
- if (th->syn)
- nf_log_buf_add(m, "SYN ");
- if (th->fin)
- nf_log_buf_add(m, "FIN ");
- /* Max length: 11 "URGP=65535 " */
- nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
- if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
- u_int8_t _opt[60 - sizeof(struct tcphdr)];
- const u_int8_t *op;
- unsigned int i;
- unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
-
- op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
- optsize, _opt);
- if (op == NULL) {
- nf_log_buf_add(m, "OPT (TRUNCATED)");
- return 1;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- nf_log_buf_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- nf_log_buf_add(m, "%02X", op[i]);
-
- nf_log_buf_add(m, ") ");
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
-
-void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
- struct sock *sk)
-{
- if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
- return;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket && sk->sk_socket->file) {
- const struct cred *cred = sk->sk_socket->file->f_cred;
- nf_log_buf_add(m, "UID=%u GID=%u ",
- from_kuid_munged(&init_user_ns, cred->fsuid),
- from_kgid_munged(&init_user_ns, cred->fsgid));
- }
- read_unlock_bh(&sk->sk_callback_lock);
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid);
-
-void
-nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo, const char *prefix)
-{
- const struct net_device *physoutdev __maybe_unused;
- const struct net_device *physindev __maybe_unused;
-
- nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
- '0' + loginfo->u.log.level, prefix,
- in ? in->name : "",
- out ? out->name : "");
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- physindev = nf_bridge_get_physindev(skb);
- if (physindev && in != physindev)
- nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
- physoutdev = nf_bridge_get_physoutdev(skb);
- if (physoutdev && out != physoutdev)
- nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
-#endif
-}
-EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
-
-/* bridge and netdev logging families share this code. */
-void nf_log_l2packet(struct net *net, u_int8_t pf,
- __be16 protocol,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- switch (protocol) {
- case htons(ETH_P_IP):
- nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_IPV6):
- nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- case htons(ETH_P_ARP):
- case htons(ETH_P_RARP):
- nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out,
- loginfo, "%s", prefix);
- break;
- }
-}
-EXPORT_SYMBOL_GPL(nf_log_l2packet);
-
-static int __init nf_log_common_init(void)
-{
- return 0;
-}
-
-static void __exit nf_log_common_exit(void) {}
-
-module_init(nf_log_common_init);
-module_exit(nf_log_common_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
deleted file mode 100644
index 350eb147754d..000000000000
--- a/net/netfilter/nf_log_netdev.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_log.h>
-
-static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out,
- loginfo, prefix);
-}
-
-static struct nf_logger nf_netdev_logger __read_mostly = {
- .name = "nf_log_netdev",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_netdev_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_netdev_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
-}
-
-static void __net_exit nf_log_netdev_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_netdev_logger);
-}
-
-static struct pernet_operations nf_log_netdev_net_ops = {
- .init = nf_log_netdev_net_init,
- .exit = nf_log_netdev_net_exit,
-};
-
-static int __init nf_log_netdev_init(void)
-{
- int ret;
-
- /* Request to load the real packet loggers. */
- nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
-
- ret = register_pernet_subsys(&nf_log_netdev_net_ops);
- if (ret < 0)
- return ret;
-
- nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
- return 0;
-}
-
-static void __exit nf_log_netdev_exit(void)
-{
- unregister_pernet_subsys(&nf_log_netdev_net_ops);
- nf_log_unregister(&nf_netdev_logger);
-}
-
-module_init(nf_log_netdev_init);
-module_exit(nf_log_netdev_exit);
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter netdev packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
new file mode 100644
index 000000000000..86d5fc5d28e3
--- /dev/null
+++ b/net/netfilter/nf_log_syslog.c
@@ -0,0 +1,1085 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static const struct nf_loginfo default_loginfo = {
+ .type = NF_LOG_TYPE_LOG,
+ .u = {
+ .log = {
+ .level = LOGLEVEL_NOTICE,
+ .logflags = NF_LOG_DEFAULT_MASK,
+ },
+ },
+};
+
+struct arppayload {
+ unsigned char mac_src[ETH_ALEN];
+ unsigned char ip_src[4];
+ unsigned char mac_dst[ETH_ALEN];
+ unsigned char ip_dst[4];
+};
+
+/* Guard against containers flooding syslog. */
+static bool nf_log_allowed(const struct net *net)
+{
+ return net_eq(net, &init_net) || sysctl_nf_log_all_netns;
+}
+
+static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb)
+{
+ u16 vid;
+
+ if (!skb_vlan_tag_present(skb))
+ return;
+
+ vid = skb_vlan_tag_get(skb);
+ nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid);
+}
+
+static void noinline_for_stack
+dump_arp_packet(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int nhoff)
+{
+ const struct arppayload *ap;
+ struct arppayload _arpp;
+ const struct arphdr *ah;
+ unsigned int logflags;
+ struct arphdr _arph;
+
+ ah = skb_header_pointer(skb, nhoff, sizeof(_arph), &_arph);
+ if (!ah) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ if (logflags & NF_LOG_MACDECODE) {
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ }
+
+ nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+ ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+ /* If it's for Ethernet and the lengths are OK, then log the ARP
+ * payload.
+ */
+ if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
+ ah->ar_hln != ETH_ALEN ||
+ ah->ar_pln != sizeof(__be32))
+ return;
+
+ ap = skb_header_pointer(skb, nhoff + sizeof(_arph), sizeof(_arpp), &_arpp);
+ if (!ap) {
+ nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
+ skb->len - sizeof(_arph));
+ return;
+ }
+ nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+ ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+static void
+nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo, const char *prefix,
+ struct net *net)
+{
+ const struct net_device *physoutdev __maybe_unused;
+ const struct net_device *physindev __maybe_unused;
+
+ nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+ '0' + loginfo->u.log.level, prefix,
+ in ? in->name : "",
+ out ? out->name : "");
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ physindev = nf_bridge_get_physindev(skb, net);
+ if (physindev && in != physindev)
+ nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+ physoutdev = nf_bridge_get_physoutdev(skb);
+ if (physoutdev && out != physoutdev)
+ nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
+#endif
+}
+
+static void nf_log_arp_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix, net);
+ dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+ .name = "nf_log_arp",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_arp_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m,
+ struct sock *sk)
+{
+ if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk)))
+ return;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket && sk->sk_socket->file) {
+ const struct cred *cred = sk->sk_socket->file->f_cred;
+
+ nf_log_buf_add(m, "UID=%u GID=%u ",
+ from_kuid_munged(&init_user_ns, cred->fsuid),
+ from_kgid_munged(&init_user_ns, cred->fsgid));
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static noinline_for_stack int
+nf_log_dump_tcp_header(struct nf_log_buf *m,
+ const struct sk_buff *skb,
+ u8 proto, int fragment,
+ unsigned int offset,
+ unsigned int logflags)
+{
+ struct tcphdr _tcph;
+ const struct tcphdr *th;
+
+ /* Max length: 10 "PROTO=TCP " */
+ nf_log_buf_add(m, "PROTO=TCP ");
+
+ if (fragment)
+ return 0;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+ if (!th) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u ",
+ ntohs(th->source), ntohs(th->dest));
+ /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+ if (logflags & NF_LOG_TCPSEQ) {
+ nf_log_buf_add(m, "SEQ=%u ACK=%u ",
+ ntohl(th->seq), ntohl(th->ack_seq));
+ }
+
+ /* Max length: 13 "WINDOW=65535 " */
+ nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
+ /* Max length: 9 "RES=0x3C " */
+ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+ TCP_RESERVED_BITS) >> 22));
+ /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */
+ if (th->ae)
+ nf_log_buf_add(m, "AE ");
+ if (th->cwr)
+ nf_log_buf_add(m, "CWR ");
+ if (th->ece)
+ nf_log_buf_add(m, "ECE ");
+ if (th->urg)
+ nf_log_buf_add(m, "URG ");
+ if (th->ack)
+ nf_log_buf_add(m, "ACK ");
+ if (th->psh)
+ nf_log_buf_add(m, "PSH ");
+ if (th->rst)
+ nf_log_buf_add(m, "RST ");
+ if (th->syn)
+ nf_log_buf_add(m, "SYN ");
+ if (th->fin)
+ nf_log_buf_add(m, "FIN ");
+ /* Max length: 11 "URGP=65535 " */
+ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+ if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) {
+ unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr);
+ u8 _opt[60 - sizeof(struct tcphdr)];
+ unsigned int i;
+ const u8 *op;
+
+ op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+ optsize, _opt);
+ if (!op) {
+ nf_log_buf_add(m, "OPT (TRUNCATED)");
+ return 1;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+
+ nf_log_buf_add(m, ") ");
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+nf_log_dump_udp_header(struct nf_log_buf *m,
+ const struct sk_buff *skb,
+ u8 proto, int fragment,
+ unsigned int offset)
+{
+ struct udphdr _udph;
+ const struct udphdr *uh;
+
+ if (proto == IPPROTO_UDP)
+ /* Max length: 10 "PROTO=UDP " */
+ nf_log_buf_add(m, "PROTO=UDP ");
+ else /* Max length: 14 "PROTO=UDPLITE " */
+ nf_log_buf_add(m, "PROTO=UDPLITE ");
+
+ if (fragment)
+ goto out;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+ if (!uh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+
+ return 1;
+ }
+
+ /* Max length: 20 "SPT=65535 DPT=65535 " */
+ nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
+ ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
+
+out:
+ return 0;
+}
+
+/* One level of recursion won't kill us */
+static noinline_for_stack void
+dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int iphoff)
+{
+ const struct iphdr *ih;
+ unsigned int logflags;
+ struct iphdr _iph;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+ if (!ih) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Important fields:
+ * TOS, len, DF/MF, fragment offset, TTL, src, dst, options.
+ * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 "
+ */
+ nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+ if (ntohs(ih->frag_off) & IP_CE)
+ nf_log_buf_add(m, "CE ");
+ if (ntohs(ih->frag_off) & IP_DF)
+ nf_log_buf_add(m, "DF ");
+ if (ntohs(ih->frag_off) & IP_MF)
+ nf_log_buf_add(m, "MF ");
+
+ /* Max length: 11 "FRAG:65535 " */
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+ if ((logflags & NF_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
+ unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+ const unsigned char *op;
+ unsigned int i, optsize;
+
+ optsize = ih->ihl * 4 - sizeof(struct iphdr);
+ op = skb_header_pointer(skb, iphoff + sizeof(_iph),
+ optsize, _opt);
+ if (!op) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 127 "OPT (" 15*4*2chars ") " */
+ nf_log_buf_add(m, "OPT (");
+ for (i = 0; i < optsize; i++)
+ nf_log_buf_add(m, "%02X", op[i]);
+ nf_log_buf_add(m, ") ");
+ }
+
+ switch (ih->protocol) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff + ih->ihl * 4, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, ih->protocol,
+ ntohs(ih->frag_off) & IP_OFFSET,
+ iphoff + ih->ihl * 4))
+ return;
+ break;
+ case IPPROTO_ICMP: {
+ static const size_t required_len[NR_ICMP_TYPES + 1] = {
+ [ICMP_ECHOREPLY] = 4,
+ [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr),
+ [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr),
+ [ICMP_REDIRECT] = 8 + sizeof(struct iphdr),
+ [ICMP_ECHO] = 4,
+ [ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr),
+ [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr),
+ [ICMP_TIMESTAMP] = 20,
+ [ICMP_TIMESTAMPREPLY] = 20,
+ [ICMP_ADDRESS] = 12,
+ [ICMP_ADDRESSREPLY] = 12 };
+ const struct icmphdr *ich;
+ struct icmphdr _icmph;
+
+ /* Max length: 11 "PROTO=ICMP " */
+ nf_log_buf_add(m, "PROTO=ICMP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_icmph), &_icmph);
+ if (!ich) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ switch (ich->type) {
+ case ICMP_ECHOREPLY:
+ case ICMP_ECHO:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ich->un.echo.id),
+ ntohs(ich->un.echo.sequence));
+ break;
+
+ case ICMP_PARAMETERPROB:
+ /* Max length: 14 "PARAMETER=255 " */
+ nf_log_buf_add(m, "PARAMETER=%u ",
+ ntohl(ich->un.gateway) >> 24);
+ break;
+ case ICMP_REDIRECT:
+ /* Max length: 24 "GATEWAY=255.255.255.255 " */
+ nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+ fallthrough;
+ case ICMP_DEST_UNREACH:
+ case ICMP_SOURCE_QUENCH:
+ case ICMP_TIME_EXCEEDED:
+ /* Max length: 3+maxlen */
+ if (!iphoff) { /* Only recurse once. */
+ nf_log_buf_add(m, "[");
+ dump_ipv4_packet(net, m, info, skb,
+ iphoff + ih->ihl * 4 + sizeof(_icmph));
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohs(ich->un.frag.mtu));
+ }
+ }
+ break;
+ }
+ /* Max Length */
+ case IPPROTO_AH: {
+ const struct ip_auth_hdr *ah;
+ struct ip_auth_hdr _ahdr;
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 9 "PROTO=AH " */
+ nf_log_buf_add(m, "PROTO=AH ");
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ah = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_ahdr), &_ahdr);
+ if (!ah) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ break;
+ }
+ case IPPROTO_ESP: {
+ const struct ip_esp_hdr *eh;
+ struct ip_esp_hdr _esph;
+
+ /* Max length: 10 "PROTO=ESP " */
+ nf_log_buf_add(m, "PROTO=ESP ");
+
+ if (ntohs(ih->frag_off) & IP_OFFSET)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ eh = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+ sizeof(_esph), &_esph);
+ if (!eh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - iphoff - ih->ihl * 4);
+ break;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 " */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+ break;
+ }
+ /* Max length: 10 "PROTO 255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & NF_LOG_UID) && !iphoff)
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (!iphoff && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+ /* Proto Max log string length */
+ /* IP: 40+46+6+11+127 = 230 */
+ /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */
+ /* UDP: 10+max(25,20) = 35 */
+ /* UDPLITE: 14+max(25,20) = 39 */
+ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+ /* ESP: 10+max(25)+15 = 50 */
+ /* AH: 9+max(25)+15 = 49 */
+ /* unknown: 10 */
+
+ /* (ICMP allows recursion one level deep) */
+ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
+ /* maxlen = 230+ 91 + 230 + 255 = 806 */
+}
+
+static noinline_for_stack void
+dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb, unsigned int ip6hoff,
+ int recurse)
+{
+ const struct ipv6hdr *ih;
+ unsigned int hdrlen = 0;
+ unsigned int logflags;
+ struct ipv6hdr _ip6h;
+ unsigned int ptr;
+ u8 currenthdr;
+ int fragment;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+ else
+ logflags = NF_LOG_DEFAULT_MASK;
+
+ ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+ if (!ih) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+ nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+ /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+ ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ ih->hop_limit,
+ (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+ fragment = 0;
+ ptr = ip6hoff + sizeof(struct ipv6hdr);
+ currenthdr = ih->nexthdr;
+ while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) {
+ struct ipv6_opt_hdr _hdr;
+ const struct ipv6_opt_hdr *hp;
+
+ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+ if (!hp) {
+ nf_log_buf_add(m, "TRUNCATED");
+ return;
+ }
+
+ /* Max length: 48 "OPT (...) " */
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, "OPT ( ");
+
+ switch (currenthdr) {
+ case IPPROTO_FRAGMENT: {
+ struct frag_hdr _fhdr;
+ const struct frag_hdr *fh;
+
+ nf_log_buf_add(m, "FRAG:");
+ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+ &_fhdr);
+ if (!fh) {
+ nf_log_buf_add(m, "TRUNCATED ");
+ return;
+ }
+
+ /* Max length: 6 "65535 " */
+ nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+ /* Max length: 11 "INCOMPLETE " */
+ if (fh->frag_off & htons(0x0001))
+ nf_log_buf_add(m, "INCOMPLETE ");
+
+ nf_log_buf_add(m, "ID:%08x ",
+ ntohl(fh->identification));
+
+ if (ntohs(fh->frag_off) & 0xFFF8)
+ fragment = 1;
+
+ hdrlen = 8;
+ break;
+ }
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_HOPOPTS:
+ if (fragment) {
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ")");
+ return;
+ }
+ hdrlen = ipv6_optlen(hp);
+ break;
+ /* Max Length */
+ case IPPROTO_AH:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_auth_hdr _ahdr;
+ const struct ip_auth_hdr *ah;
+
+ /* Max length: 3 "AH " */
+ nf_log_buf_add(m, "AH ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+ &_ahdr);
+ if (!ah) {
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 15 "SPI=0xF1234567 */
+ nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+ }
+
+ hdrlen = ipv6_authlen(hp);
+ break;
+ case IPPROTO_ESP:
+ if (logflags & NF_LOG_IPOPT) {
+ struct ip_esp_hdr _esph;
+ const struct ip_esp_hdr *eh;
+
+ /* Max length: 4 "ESP " */
+ nf_log_buf_add(m, "ESP ");
+
+ if (fragment) {
+ nf_log_buf_add(m, ")");
+ return;
+ }
+
+ /* Max length: 26 "INCOMPLETE [65535 bytes] )" */
+ eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+ &_esph);
+ if (!eh) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Length: 16 "SPI=0xF1234567 )" */
+ nf_log_buf_add(m, "SPI=0x%x )",
+ ntohl(eh->spi));
+ }
+ return;
+ default:
+ /* Max length: 20 "Unknown Ext Hdr 255" */
+ nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
+ return;
+ }
+ if (logflags & NF_LOG_IPOPT)
+ nf_log_buf_add(m, ") ");
+
+ currenthdr = hp->nexthdr;
+ ptr += hdrlen;
+ }
+
+ switch (currenthdr) {
+ case IPPROTO_TCP:
+ if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
+ ptr, logflags))
+ return;
+ break;
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
+ return;
+ break;
+ case IPPROTO_ICMPV6: {
+ struct icmp6hdr _icmp6h;
+ const struct icmp6hdr *ic;
+
+ /* Max length: 13 "PROTO=ICMPv6 " */
+ nf_log_buf_add(m, "PROTO=ICMPv6 ");
+
+ if (fragment)
+ break;
+
+ /* Max length: 25 "INCOMPLETE [65535 bytes] " */
+ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+ if (!ic) {
+ nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+ skb->len - ptr);
+ return;
+ }
+
+ /* Max length: 18 "TYPE=255 CODE=255 " */
+ nf_log_buf_add(m, "TYPE=%u CODE=%u ",
+ ic->icmp6_type, ic->icmp6_code);
+
+ switch (ic->icmp6_type) {
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ /* Max length: 19 "ID=65535 SEQ=65535 " */
+ nf_log_buf_add(m, "ID=%u SEQ=%u ",
+ ntohs(ic->icmp6_identifier),
+ ntohs(ic->icmp6_sequence));
+ break;
+ case ICMPV6_MGM_QUERY:
+ case ICMPV6_MGM_REPORT:
+ case ICMPV6_MGM_REDUCTION:
+ break;
+
+ case ICMPV6_PARAMPROB:
+ /* Max length: 17 "POINTER=ffffffff " */
+ nf_log_buf_add(m, "POINTER=%08x ",
+ ntohl(ic->icmp6_pointer));
+ fallthrough;
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_PKT_TOOBIG:
+ case ICMPV6_TIME_EXCEED:
+ /* Max length: 3+maxlen */
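+ /* Log the offending packet embedded in the ICMPv6 error;
+ * recursion is disabled so nesting stops after one level.
+ */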
+ if (recurse) {
+ nf_log_buf_add(m, "[");
+ dump_ipv6_packet(net, m, info, skb,
+ ptr + sizeof(_icmp6h), 0);
+ nf_log_buf_add(m, "] ");
+ }
+
+ /* Max length: 10 "MTU=65535 " */
+ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
+ nf_log_buf_add(m, "MTU=%u ",
+ ntohl(ic->icmp6_mtu));
+ }
+ }
+ break;
+ }
+ /* Max length: 10 "PROTO=255 " */
+ default:
+ nf_log_buf_add(m, "PROTO=%u ", currenthdr);
+ }
+
+ /* Max length: 15 "UID=4294967295 " */
+ if ((logflags & NF_LOG_UID) && recurse)
+ nf_log_dump_sk_uid_gid(net, m, skb->sk);
+
+ /* Max length: 16 "MARK=0xFFFFFFFF " */
+ if (recurse && skb->mark)
+ nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_mac_header(struct nf_log_buf *m,
+ const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & NF_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest);
+ nf_log_dump_vlan(m, skb);
+ nf_log_buf_add(m, "MACPROTO=%04x ",
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ nf_log_buf_add(m, "MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
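+ /* On SIT devices the mac header points at the IPv4 tunnel
+ * header; step back to include the preceding Ethernet header,
+ * provided it is still within the skb head.
+ */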
+ if (dev->type == ARPHRD_SIT) {
+ p -= ETH_HLEN;
+
+ if (p < skb->head)
+ p = NULL;
+ }
+
+ if (p) {
+ nf_log_buf_add(m, "%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++)
+ nf_log_buf_add(m, ":%02x", *p++);
+ }
+
+ if (dev->type == ARPHRD_SIT) {
+ const struct iphdr *iph =
+ (struct iphdr *)skb_mac_header(skb);
+
+ nf_log_buf_add(m, " TUNNEL=%pI4->%pI4", &iph->saddr,
+ &iph->daddr);
+ }
+ }
+ nf_log_buf_add(m, " ");
+}
+
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+ out, loginfo, prefix, net);
+
+ if (in)
+ dump_mac_header(m, loginfo, skb);
+
+ dump_ipv4_packet(net, m, loginfo, skb, skb_network_offset(skb));
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+ .name = "nf_log_ipv4",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum, const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
+ loginfo, prefix, net);
+
+ if (in)
+ dump_mac_header(m, loginfo, skb);
+
+ dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
+
+ nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_ip6_logger __read_mostly = {
+ .name = "nf_log_ipv6",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_ip6_packet,
+ .me = THIS_MODULE,
+};
+
+static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ struct nf_log_buf *m;
+
+ if (!nf_log_allowed(net))
+ return;
+
+ m = nf_log_buf_open();
+
+ if (!loginfo)
+ loginfo = &default_loginfo;
+
+ nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+ prefix, net);
+
+ dump_mac_header(m, loginfo, skb);
+
+ nf_log_buf_close(m);
+}
+
+static void nf_log_netdev_packet(struct net *net, u_int8_t pf,
+ unsigned int hooknum,
+ const struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ const struct nf_loginfo *loginfo,
+ const char *prefix)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix);
+ break;
+ default:
+ nf_log_unknown_packet(net, pf, hooknum, skb,
+ in, out, loginfo, prefix);
+ break;
+ }
+}
+
+static struct nf_logger nf_netdev_logger __read_mostly = {
+ .name = "nf_log_netdev",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static struct nf_logger nf_bridge_logger __read_mostly = {
+ .name = "nf_log_bridge",
+ .type = NF_LOG_TYPE_LOG,
+ .logfn = nf_log_netdev_packet,
+ .me = THIS_MODULE,
+};
+
+static int __net_init nf_log_syslog_net_init(struct net *net)
+{
+ int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+
+ if (ret)
+ return ret;
+
+ ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+ if (ret)
+ goto err1;
+
+ ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
+ if (ret)
+ goto err2;
+
+ ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger);
+ if (ret)
+ goto err3;
+
+ ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
+ if (ret)
+ goto err4;
+ return 0;
+err4:
+ nf_log_unset(net, &nf_netdev_logger);
+err3:
+ nf_log_unset(net, &nf_ip6_logger);
+err2:
+ nf_log_unset(net, &nf_arp_logger);
+err1:
+ nf_log_unset(net, &nf_ip_logger);
+ return ret;
+}
+
+static void __net_exit nf_log_syslog_net_exit(struct net *net)
+{
+ nf_log_unset(net, &nf_ip_logger);
+ nf_log_unset(net, &nf_arp_logger);
+ nf_log_unset(net, &nf_ip6_logger);
+ nf_log_unset(net, &nf_netdev_logger);
+ nf_log_unset(net, &nf_bridge_logger);
+}
+
+static struct pernet_operations nf_log_syslog_net_ops = {
+ .init = nf_log_syslog_net_init,
+ .exit = nf_log_syslog_net_exit,
+};
+
+static int __init nf_log_syslog_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&nf_log_syslog_net_ops);
+ if (ret < 0)
+ return ret;
+
+ ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+ if (ret < 0)
+ goto err1;
+
+ ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+ if (ret < 0)
+ goto err2;
+
+ ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+ if (ret < 0)
+ goto err3;
+
+ ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger);
+ if (ret < 0)
+ goto err4;
+
+ ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger);
+ if (ret < 0)
+ goto err5;
+
+ return 0;
+err5:
+ nf_log_unregister(&nf_netdev_logger);
+err4:
+ nf_log_unregister(&nf_ip6_logger);
+err3:
+ nf_log_unregister(&nf_arp_logger);
+err2:
+ nf_log_unregister(&nf_ip_logger);
+err1:
+ pr_err("failed to register logger\n");
+ unregister_pernet_subsys(&nf_log_syslog_net_ops);
+ return ret;
+}
+
+static void __exit nf_log_syslog_exit(void)
+{
+ unregister_pernet_subsys(&nf_log_syslog_net_ops);
+ nf_log_unregister(&nf_ip_logger);
+ nf_log_unregister(&nf_arp_logger);
+ nf_log_unregister(&nf_ip6_logger);
+ nf_log_unregister(&nf_netdev_logger);
+ nf_log_unregister(&nf_bridge_logger);
+}
+
+module_init(nf_log_syslog_init);
+module_exit(nf_log_syslog_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter syslog packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("nf_log_arp");
+MODULE_ALIAS("nf_log_bridge");
+MODULE_ALIAS("nf_log_ipv4");
+MODULE_ALIAS("nf_log_ipv6");
+MODULE_ALIAS("nf_log_netdev");
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0);
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
+MODULE_ALIAS_NF_LOGGER(3, 0); /* NFPROTO_ARP */
+MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index e4d61a7a5258..98deef6cde69 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Amanda extension for TCP NAT alteration.
* (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
* based on a copy of HW's ip_nat_irc.c as well as other modules
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -19,10 +15,15 @@
#include <net/netfilter/nf_nat_helper.h>
#include <linux/netfilter/nf_conntrack_amanda.h>
+#define NAT_HELPER_NAME "amanda"
+
MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
MODULE_DESCRIPTION("Amanda NAT helper");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_amanda");
+MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME);
+
+static struct nf_conntrack_nat_helper nat_helper_amanda =
+ NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME);
static unsigned int help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
@@ -43,19 +44,7 @@ static unsigned int help(struct sk_buff *skb,
exp->expectfn = nf_nat_follow_master;
/* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int res;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- res = nf_ct_expect_related(exp);
- if (res == 0)
- break;
- else if (res != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
@@ -74,6 +63,7 @@ static unsigned int help(struct sk_buff *skb,
static void __exit nf_nat_amanda_fini(void)
{
+ nf_nat_helper_unregister(&nat_helper_amanda);
RCU_INIT_POINTER(nf_nat_amanda_hook, NULL);
synchronize_rcu();
}
@@ -81,6 +71,7 @@ static void __exit nf_nat_amanda_fini(void)
static int __init nf_nat_amanda_init(void)
{
BUG_ON(nf_nat_amanda_hook != NULL);
+ nf_nat_helper_register(&nat_helper_amanda);
RCU_INIT_POINTER(nf_nat_amanda_hook, help);
return 0;
}
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
new file mode 100644
index 000000000000..481be15609b1
--- /dev/null
+++ b/net/netfilter/nf_nat_bpf.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable NAT Helpers for XDP and TC-BPF hooks
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+
+__bpf_kfunc_start_defs();
+
+/* bpf_ct_set_nat_info - Set source or destination nat address
+ *
+ * Set source or destination nat address of the newly allocated
+ * nf_conn before insertion. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct - Pointer to referenced nf_conn object, obtained using
+ * bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr - Nat source/destination address
+ * @port - Nat source/destination port. Non-positive values are
+ * interpreted as a request to select a random port.
+ * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ */
+__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+ union nf_inet_addr *addr, int port,
+ enum nf_nat_manip_type manip)
+{
+ struct nf_conn *ct = (struct nf_conn *)nfct;
+ u16 proto = nf_ct_l3num(ct);
+ struct nf_nat_range2 range;
+
+ if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
+ return -EINVAL;
+
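+ /* Build a single-address (and, if @port is given, single-port)
+ * NAT range for nf_nat_setup_info().
+ */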
+ memset(&range, 0, sizeof(struct nf_nat_range2));
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = *addr;
+ range.max_addr = range.min_addr;
+ if (port > 0) {
+ range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ range.min_proto.all = cpu_to_be16(port);
+ range.max_proto.all = range.min_proto.all;
+ }
+
+ return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(nf_nat_kfunc_set)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(nf_nat_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &nf_nat_kfunc_set,
+};
+
+int register_nf_nat_bpf(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &nf_bpf_nat_kfunc_set);
+ if (ret)
+ return ret;
+
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &nf_bpf_nat_kfunc_set);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index d159e9e7835b..78a61dac4ade 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -16,32 +13,31 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/rtnetlink.h>
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
-#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <uapi/linux/netfilter/nf_nat.h>
#include "nf_internals.h"
+#define NF_NAT_MAX_ATTEMPTS 128
+#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)
+
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
static DEFINE_MUTEX(nf_nat_proto_mutex);
-static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
- __read_mostly;
static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
-static unsigned int nf_nat_hash_rnd __read_mostly;
+static siphash_aligned_key_t nf_nat_hash_rnd;
struct nf_nat_lookup_hook_priv {
struct nf_hook_entries __rcu *entries;
@@ -58,16 +54,71 @@ struct nat_net {
struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};
-inline const struct nf_nat_l3proto *
-__nf_nat_l3proto_find(u8 family)
+#ifdef CONFIG_XFRM
+static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
{
- return rcu_dereference(nf_nat_l3protos[family]);
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi4 *fl4 = &fl->u.ip4;
+
+ if (ct->status & statusbit) {
+ fl4->daddr = t->dst.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_dport = t->dst.u.all;
+ }
+
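+ /* Toggle between IPS_SRC_NAT and IPS_DST_NAT to fill in the
+ * other half of the flow key.
+ */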
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl4->saddr = t->src.u3.ip;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl4->fl4_sport = t->src.u.all;
+ }
+}
+
+static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ unsigned long statusbit,
+ struct flowi *fl)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
+ struct flowi6 *fl6 = &fl->u.ip6;
+
+ if (ct->status & statusbit) {
+ fl6->daddr = t->dst.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_dport = t->dst.u.all;
+ }
+
+ statusbit ^= IPS_NAT_MASK;
+
+ if (ct->status & statusbit) {
+ fl6->saddr = t->src.u3.in6;
+ if (t->dst.protonum == IPPROTO_TCP ||
+ t->dst.protonum == IPPROTO_UDP ||
+ t->dst.protonum == IPPROTO_UDPLITE ||
+ t->dst.protonum == IPPROTO_SCTP)
+ fl6->fl6_sport = t->src.u.all;
+ }
+#endif
}
-#ifdef CONFIG_XFRM
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
- const struct nf_nat_l3proto *l3proto;
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
enum ip_conntrack_dir dir;
@@ -79,74 +130,85 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
return;
family = nf_ct_l3num(ct);
- l3proto = __nf_nat_l3proto_find(family);
- if (l3proto == NULL)
- return;
-
dir = CTINFO2DIR(ctinfo);
if (dir == IP_CT_DIR_ORIGINAL)
statusbit = IPS_DST_NAT;
else
statusbit = IPS_SRC_NAT;
- l3proto->decode_session(skb, ct, dir, statusbit, fl);
-}
-
-int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
-{
- struct flowi fl;
- unsigned int hh_len;
- struct dst_entry *dst;
- struct sock *sk = skb->sk;
- int err;
-
- err = xfrm_decode_session(skb, &fl, family);
- if (err < 0)
- return err;
-
- dst = skb_dst(skb);
- if (dst->xfrm)
- dst = ((struct xfrm_dst *)dst)->route;
- if (!dst_hold_safe(dst))
- return -EHOSTUNREACH;
-
- if (sk && !net_eq(net, sock_net(sk)))
- sk = NULL;
-
- dst = xfrm_lookup(net, dst, &fl, sk, 0);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- skb_dst_drop(skb);
- skb_dst_set(skb, dst);
-
- /* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
- if (skb_headroom(skb) < hh_len &&
- pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -ENOMEM;
- return 0;
+ switch (family) {
+ case NFPROTO_IPV4:
+ nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl);
+ return;
+ case NFPROTO_IPV6:
+ nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl);
+ return;
+ }
}
-EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
-hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net,
+ const struct nf_conntrack_zone *zone,
+ const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
+ struct {
+ struct nf_conntrack_man src;
+ u32 net_mix;
+ u32 protonum;
+ u32 zone;
+ } __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+ memset(&combined, 0, sizeof(combined));
+
/* Original src, to ensure we map it consistently if poss. */
- hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
- tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
+ combined.src = tuple->src;
+ combined.net_mix = net_hash_mix(net);
+ combined.protonum = tuple->dst.protonum;
+
+ /* Zone ID can be used provided it is valid for both directions */
+ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
+ combined.zone = zone->id;
+
+ hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
return reciprocal_scale(hash, nf_nat_htable_size);
}
-/* Is this tuple already taken? (not by us) */
-int
+/**
+ * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_conntrack: our (unconfirmed) conntrack entry
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple in either direction.
+ *
+ * Example:
+ * INITIATOR -> NAT/PAT -> RESPONDER
+ *
+ * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
+ * Then, later, NAT/PAT itself also connects to RESPONDER.
+ *
+ * This will not work if the SNAT done earlier has same IP:PORT source pair.
+ *
+ * Conntrack table has:
+ * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * and new locally originating connection wants:
+ * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
+ * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
+ *
+ * ... which would mean incoming packets could not be attributed to either
+ * the existing or the newly added entry (identical IP_CT_DIR_REPLY tuple).
+ *
+ * Return: true if the proposed NAT mapping collides with an existing entry.
+ */
+static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
{
@@ -158,10 +220,185 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
*/
struct nf_conntrack_tuple reply;
- nf_ct_invert_tuplepr(&reply, tuple);
+ nf_ct_invert_tuple(&reply, tuple);
return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
-EXPORT_SYMBOL(nf_nat_used_tuple);
+
+static bool nf_nat_allow_clash(const struct nf_conn *ct)
+{
+ return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
+}
+
+/**
+ * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
+ * @tuple: proposed NAT binding
+ * @ignored_ct: our (unconfirmed) conntrack entry
+ *
+ * Same as nf_nat_used_tuple, but also checks for a rare clash in the
+ * reverse direction. Should be called only when @tuple has not been
+ * altered, i.e. @ignored_ct will not be subject to NAT.
+ *
+ * Return: true if the proposed NAT mapping collides with an existing entry.
+ */
+static noinline bool
+nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_ct)
+{
+ static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST;
+ const struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ if (!nf_nat_used_tuple(tuple, ignored_ct))
+ return false;
+
+ if (!nf_nat_allow_clash(ignored_ct))
+ return true;
+
+ /* Initial choice clashes with existing conntrack.
+ * Check for (rare) reverse collision.
+ *
+ * This can happen when new packets are received in both directions
+ * at the exact same time on different CPUs.
+ *
+ * Without SMP, the first packet creates a new conntrack entry and the
+ * second packet is resolved as an established reply packet.
+ *
+ * With parallel processing, both packets could be picked up as
+ * new and both get their own ct entry allocated.
+ *
+ * If ignored_ct and the colliding ct are not subject to NAT then
+ * pretend the tuple is available and let later clash resolution
+ * handle this at insertion time.
+ *
+ * Without it, the 'reply' packet would have its source port rewritten
+ * by the nat engine.
+ */
+ if (READ_ONCE(ignored_ct->status) & uses_nat)
+ return true;
+
+ net = nf_ct_net(ignored_ct);
+ zone = nf_ct_zone(ignored_ct);
+
+ thash = nf_conntrack_find_get(net, zone, tuple);
+ if (unlikely(!thash)) {
+ struct nf_conntrack_tuple reply;
+
+ nf_ct_invert_tuple(&reply, tuple);
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash) /* clashing entry went away */
+ return false;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ /* NB: IP_CT_DIR_ORIGINAL should be impossible because
+ * nf_nat_used_tuple() handles origin collisions.
+ *
+ * Handle the remote chance that another CPU confirmed its ct right after.
+ */
+ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
+ goto out;
+
+ /* clashing connection subject to NAT? Retry with new tuple. */
+ if (READ_ONCE(ct->status) & uses_nat)
+ goto out;
+
+ if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+ taken = false;
+ goto out;
+ }
+out:
+ nf_ct_put(ct);
+ return taken;
+}
+
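+/* Evict only entries that are already closing (at least TIME_WAIT),
+ * were source-natted, and have neither a fixed timeout nor pending
+ * destruction.
+ */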
+static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
+{
+ static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
+ IPS_DYING;
+ static const unsigned long flags_needed = IPS_SRC_NAT;
+ enum tcp_conntrack old_state;
+
+ old_state = READ_ONCE(ct->proto.tcp.state);
+ if (old_state < TCP_CONNTRACK_TIME_WAIT)
+ return false;
+
+ if (flags & flags_refuse)
+ return false;
+
+ return (flags & flags_needed) == flags_needed;
+}
+
+/* reverse direction will send packets to new source, so
+ * make sure such packets are invalid.
+ */
+static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
+{
+ return (__s32)(new->proto.tcp.seen[0].td_end -
+ old->proto.tcp.seen[0].td_end) > 0;
+}
+
+static int
+nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
+ const struct nf_conn *ignored_conntrack,
+ unsigned int attempts_left)
+{
+ static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
+ struct nf_conntrack_tuple_hash *thash;
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple reply;
+ unsigned long flags;
+ struct nf_conn *ct;
+ bool taken = true;
+ struct net *net;
+
+ nf_ct_invert_tuple(&reply, tuple);
+
+ if (attempts_left > NF_NAT_HARDER_THRESH ||
+ tuple->dst.protonum != IPPROTO_TCP ||
+ ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
+ return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+
+ /* Last few attempts to find a free tcp port. Destructive
+ * action: evict the colliding entry if it is in timewait state and
+ * the tcp sequence number has advanced past the one used by the
+ * old entry.
+ */
+ net = nf_ct_net(ignored_conntrack);
+ zone = nf_ct_zone(ignored_conntrack);
+
+ thash = nf_conntrack_find_get(net, zone, &reply);
+ if (!thash)
+ return false;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+
+ if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
+ goto out;
+
+ if (WARN_ON_ONCE(ct == ignored_conntrack))
+ goto out;
+
+ flags = READ_ONCE(ct->status);
+ if (!nf_nat_may_kill(ct, flags))
+ goto out;
+
+ if (!nf_seq_has_advanced(ct, ignored_conntrack))
+ goto out;
+
+ /* Even if we can evict, do not reuse the tuple if the entry is offloaded. */
+ if (nf_ct_kill(ct))
+ taken = flags & flags_offload;
+out:
+ nf_ct_put(ct);
+ return taken;
+}
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
const struct nf_nat_range2 *range)
@@ -183,7 +420,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
__be16 port;
switch (tuple->dst.protonum) {
- case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
@@ -191,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
case IPPROTO_TCP:
case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- case IPPROTO_DCCP:
case IPPROTO_SCTP:
if (maniptype == NF_NAT_MANIP_SRC)
port = tuple->src.u.all;
@@ -208,7 +444,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
/* If we source map this tuple so reply looks like reply_tuple, will
* that meet the constraints of range.
*/
-static int in_range(const struct nf_conntrack_tuple *tuple,
+static int nf_in_range(const struct nf_conntrack_tuple *tuple,
const struct nf_nat_range2 *range)
{
/* If we are supposed to map IPs, then we must be in the
@@ -245,7 +481,7 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range2 *range)
{
- unsigned int h = hash_by_src(net, tuple);
+ unsigned int h = hash_by_src(net, zone, tuple);
const struct nf_conn *ct;
hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
@@ -253,11 +489,11 @@ find_appropriate_src(struct net *net,
net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
- nf_ct_invert_tuplepr(result,
+ nf_ct_invert_tuple(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
- if (in_range(result, range))
+ if (nf_in_range(result, range))
return 1;
}
}
@@ -351,16 +587,20 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
unsigned int range_size, min, max, i, attempts;
__be16 *keyptr;
u16 off;
- static const unsigned int max_attempts = 128;
switch (tuple->dst.protonum) {
- case IPPROTO_ICMP: /* fallthrough */
+ case IPPROTO_ICMP:
case IPPROTO_ICMPV6:
/* id is same for either direction... */
keyptr = &tuple->src.u.icmp.id;
- min = range->min_proto.icmp.id;
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
+ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
+ min = 0;
+ range_size = 65536;
+ } else {
+ min = ntohs(range->min_proto.icmp.id);
+ range_size = ntohs(range->max_proto.icmp.id) -
+ ntohs(range->min_proto.icmp.id) + 1;
+ }
goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
case IPPROTO_GRE:
@@ -383,11 +623,10 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
}
goto find_free_id;
#endif
- case IPPROTO_UDP: /* fallthrough */
- case IPPROTO_UDPLITE: /* fallthrough */
- case IPPROTO_TCP: /* fallthrough */
- case IPPROTO_SCTP: /* fallthrough */
- case IPPROTO_DCCP: /* fallthrough */
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
if (maniptype == NF_NAT_MANIP_SRC)
keyptr = &tuple->src.u.all;
else
@@ -428,12 +667,15 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
find_free_id:
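+ /* Pick the starting offset within the range: honour an explicit
+ * offset if requested, randomize for SNAT or when randomization
+ * was asked for, and start at the beginning of the range for
+ * plain DNAT.
+ */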
if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
+ else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
+ maniptype != NF_NAT_MANIP_DST)
+ off = get_random_u16();
else
- off = prandom_u32();
+ off = 0;
attempts = range_size;
- if (attempts > max_attempts)
- attempts = max_attempts;
+ if (attempts > NF_NAT_MAX_ATTEMPTS)
+ attempts = NF_NAT_MAX_ATTEMPTS;
/* We are in softirq; doing a search of the entire range risks
* soft lockup when all tuples are already used.
@@ -444,14 +686,14 @@ find_free_id:
another_round:
for (i = 0; i < attempts; i++, off++) {
*keyptr = htons(min + off % range_size);
- if (!nf_nat_used_tuple(tuple, ct))
+ if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i))
return;
}
if (attempts >= range_size || attempts < 16)
return;
attempts /= 2;
- off = prandom_u32();
+ off = get_random_u16();
goto another_round;
}
@@ -460,7 +702,7 @@ another_round:
* and NF_INET_LOCAL_OUT, we change the destination to map into the
* range. It might not be possible to get a unique tuple, but we try.
* At worst (or if we race), we will end up with a final duplicate in
- * __ip_conntrack_confirm and drop the packet. */
+ * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
@@ -484,8 +726,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* try the original tuple first */
- if (in_range(orig_tuple, range)) {
- if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ if (nf_in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple_new(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
@@ -510,8 +752,8 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
- &range->min_proto,
- &range->max_proto) &&
+ &range->min_proto,
+ &range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
return;
@@ -560,8 +802,8 @@ nf_nat_setup_info(struct nf_conn *ct,
* manipulations (future optimization: if num_manips == 0,
* orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
*/
- nf_ct_invert_tuplepr(&curr_tuple,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ nf_ct_invert_tuple(&curr_tuple,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
@@ -569,7 +811,7 @@ nf_nat_setup_info(struct nf_conn *ct,
struct nf_conntrack_tuple reply;
/* Alter conntrack table so will recognize replies. */
- nf_ct_invert_tuplepr(&reply, &new_tuple);
+ nf_ct_invert_tuple(&reply, &new_tuple);
nf_conntrack_alter_reply(ct, &reply);
/* Non-atomic: we own this at the moment. */
@@ -587,7 +829,7 @@ nf_nat_setup_info(struct nf_conn *ct,
unsigned int srchash;
spinlock_t *lock;
- srchash = hash_by_src(net,
+ srchash = hash_by_src(net, nf_ct_zone(ct),
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
@@ -632,23 +874,6 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
-static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
- enum nf_nat_manip_type mtype,
- enum ip_conntrack_dir dir)
-{
- const struct nf_nat_l3proto *l3proto;
- struct nf_conntrack_tuple target;
-
- /* We are aiming to look like inverse of other direction. */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-
- l3proto = __nf_nat_l3proto_find(target.src.l3num);
- if (!l3proto->manip_pkt(skb, 0, &target, mtype))
- return NF_DROP;
-
- return NF_ACCEPT;
-}
-
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
@@ -677,6 +902,16 @@ unsigned int nf_nat_packet(struct nf_conn *ct,
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
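+/* With VRF, the POST_ROUTING hook runs both for the l3mdev (VRF
+ * master) device and for the real outgoing device; skip the master
+ * pass so the packet is only NATed once.
+ */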
+static bool in_vrf_postrouting(const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+ if (state->hook == NF_INET_POST_ROUTING &&
+ netif_is_l3_master(state->out))
+ return true;
+#endif
+ return false;
+}
+
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -693,7 +928,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb,
* packet filter it out, or implement conntrack/NAT for that
* protocol. 8) --RR
*/
- if (!ct)
+ if (!ct || in_vrf_postrouting(state))
return NF_ACCEPT;
nat = nfct_nat(ct);
@@ -769,11 +1004,11 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data)
return i->status & IPS_NAT_MASK ? 1 : 0;
}
-static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
+static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
unsigned int h;
- h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
hlist_del_rcu(&ct->nat_bysource);
spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
@@ -791,7 +1026,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* will delete entry from already-freed table.
*/
if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
- __nf_nat_cleanup_conntrack(ct);
+ nf_nat_cleanup_conntrack(ct);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -799,47 +1034,6 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
return 0;
}
-static void nf_nat_l3proto_clean(u8 l3proto)
-{
- struct nf_nat_proto_clean clean = {
- .l3proto = l3proto,
- };
-
- nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
-}
-
-int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
-{
- RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
-
-void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
-{
- mutex_lock(&nf_nat_proto_mutex);
- RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
- mutex_unlock(&nf_nat_proto_mutex);
- synchronize_rcu();
-
- nf_nat_l3proto_clean(l3proto->l3proto);
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
-
-/* No one using conntrack by the time this called. */
-static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
-{
- if (ct->status & IPS_SRC_NAT_DONE)
- __nf_nat_cleanup_conntrack(ct);
-}
-
-static struct nf_ct_ext_type nat_extend __read_mostly = {
- .len = sizeof(struct nf_conn_nat),
- .align = __alignof__(struct nf_conn_nat),
- .destroy = nf_nat_cleanup_conntrack,
- .id = NF_CT_EXT_NAT,
-};
-
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
#include <linux/netfilter/nfnetlink.h>
@@ -872,8 +1066,8 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr,
struct nlattr *tb[CTA_PROTONAT_MAX+1];
int err;
- err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr,
- protonat_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr,
+ protonat_nla_policy, NULL);
if (err < 0)
return err;
@@ -888,22 +1082,65 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
[CTA_NAT_PROTO] = { .type = NLA_NESTED },
};
+static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range2 *range)
+{
+ if (tb[CTA_NAT_V4_MINIP]) {
+ range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP],
+ range->min_addr.ip);
+
+ return 0;
+}
+
+static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
+ struct nf_nat_range2 *range)
+{
+ if (tb[CTA_NAT_V6_MINIP]) {
+ nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
+ sizeof(struct in6_addr));
+ range->flags |= NF_NAT_RANGE_MAP_IPS;
+ }
+
+ if (tb[CTA_NAT_V6_MAXIP])
+ nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
+ sizeof(struct in6_addr));
+ else
+ range->max_addr = range->min_addr;
+
+ return 0;
+}
+
static int
nfnetlink_parse_nat(const struct nlattr *nat,
- const struct nf_conn *ct, struct nf_nat_range2 *range,
- const struct nf_nat_l3proto *l3proto)
+ const struct nf_conn *ct, struct nf_nat_range2 *range)
{
struct nlattr *tb[CTA_NAT_MAX+1];
int err;
memset(range, 0, sizeof(*range));
- err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat,
+ nat_nla_policy, NULL);
if (err < 0)
return err;
- err = l3proto->nlattr_to_range(tb, range);
- if (err < 0)
+ switch (nf_ct_l3num(ct)) {
+ case NFPROTO_IPV4:
+ err = nf_nat_ipv4_nlattr_to_range(tb, range);
+ break;
+ case NFPROTO_IPV6:
+ err = nf_nat_ipv6_nlattr_to_range(tb, range);
+ break;
+ default:
+ err = -EPROTONOSUPPORT;
+ break;
+ }
+
+ if (err)
return err;
if (!tb[CTA_NAT_PROTO])
@@ -919,7 +1156,6 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
const struct nlattr *attr)
{
struct nf_nat_range2 range;
- const struct nf_nat_l3proto *l3proto;
int err;
/* Should not happen, restricted to creating new conntracks
@@ -928,18 +1164,11 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
return -EEXIST;
- /* Make sure that L3 NAT is there by when we call nf_nat_setup_info to
- * attach the null binding, otherwise this may oops.
- */
- l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
- if (l3proto == NULL)
- return -EAGAIN;
-
/* No NAT information has been passed, allocate the null-binding */
if (attr == NULL)
return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;
- err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
+ err = nfnetlink_parse_nat(attr, ct, &range);
if (err < 0)
return err;
@@ -960,7 +1189,7 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
.expectfn = nf_nat_follow_master,
};
-int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
struct nat_net *nat_net = net_generic(net, nat_net_id);
@@ -970,14 +1199,12 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
struct nf_hook_ops *nat_ops;
int i, ret;
- if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
+ if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
return -EINVAL;
- nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+ nat_proto_net = &nat_net->nat_proto_net[pf];
for (i = 0; i < ops_count; i++) {
- if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
- return -EINVAL;
if (orig_nat_ops[i].hooknum == hooknum) {
hooknum = i;
break;
@@ -991,7 +1218,7 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
if (!nat_proto_net->nat_hook_ops) {
WARN_ON(nat_proto_net->users != 0);
- nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
+ nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL);
if (!nat_ops) {
mutex_unlock(&nf_nat_proto_mutex);
return -ENOMEM;
@@ -1036,10 +1263,9 @@ int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
mutex_unlock(&nf_nat_proto_mutex);
return ret;
}
-EXPORT_SYMBOL_GPL(nf_nat_register_fn);
-void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
- unsigned int ops_count)
+void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
+ unsigned int ops_count)
{
struct nat_net *nat_net = net_generic(net, nat_net_id);
struct nf_nat_hooks_net *nat_proto_net;
@@ -1048,10 +1274,10 @@ void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
int hooknum = ops->hooknum;
int i;
- if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
+ if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
return;
- nat_proto_net = &nat_net->nat_proto_net[ops->pf];
+ nat_proto_net = &nat_net->nat_proto_net[pf];
mutex_lock(&nf_nat_proto_mutex);
if (WARN_ON(nat_proto_net->users == 0))
@@ -1085,19 +1311,18 @@ void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
unlock:
mutex_unlock(&nf_nat_proto_mutex);
}
-EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
static struct pernet_operations nat_net_ops = {
.id = &nat_net_id,
.size = sizeof(struct nat_net),
};
-static struct nf_nat_hook nat_hook = {
+static const struct nf_nat_hook nat_hook = {
.parse_nat_setup = nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
.decode_session = __nf_nat_decode_session,
#endif
- .manip_pkt = nf_nat_manip_pkt,
+ .remove_nat_bysrc = nf_nat_cleanup_conntrack,
};
static int __init nf_nat_init(void)
@@ -1113,19 +1338,12 @@ static int __init nf_nat_init(void)
if (!nf_nat_bysource)
return -ENOMEM;
- ret = nf_ct_extend_register(&nat_extend);
- if (ret < 0) {
- kvfree(nf_nat_bysource);
- pr_err("Unable to register extension\n");
- return ret;
- }
-
for (i = 0; i < CONNTRACK_LOCKS; i++)
spin_lock_init(&nf_nat_locks[i]);
ret = register_pernet_subsys(&nat_net_ops);
if (ret < 0) {
- nf_ct_extend_unregister(&nat_extend);
+ kvfree(nf_nat_bysource);
return ret;
}
@@ -1134,7 +1352,16 @@ static int __init nf_nat_init(void)
WARN_ON(nf_nat_hook != NULL);
RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
- return 0;
+ ret = register_nf_nat_bpf();
+ if (ret < 0) {
+ RCU_INIT_POINTER(nf_nat_hook, NULL);
+ nf_ct_helper_expectfn_unregister(&follow_master_nat);
+ synchronize_net();
+ unregister_pernet_subsys(&nat_net_ops);
+ kvfree(nf_nat_bysource);
+ }
+
+ return ret;
}
static void __exit nf_nat_cleanup(void)
@@ -1143,7 +1370,6 @@ static void __exit nf_nat_cleanup(void)
nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
- nf_ct_extend_unregister(&nat_extend);
nf_ct_helper_expectfn_unregister(&follow_master_nat);
RCU_INIT_POINTER(nf_nat_hook, NULL);
@@ -1153,6 +1379,7 @@ static void __exit nf_nat_cleanup(void)
}
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Network address translation core");
module_init(nf_nat_init);
module_exit(nf_nat_cleanup);
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index 5063cbf1689c..c92a436d9c48 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* FTP extension for TCP NAT alteration. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -21,13 +18,18 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_ftp.h>
+#define NAT_HELPER_NAME "ftp"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp NAT helper");
-MODULE_ALIAS("ip_nat_ftp");
+MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME);
/* FIXME: Time out? --RR */
+static struct nf_conntrack_nat_helper nat_helper_ftp =
+ NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME);
+
static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
char *buffer, size_t buflen,
union nf_inet_addr *addr, u16 port)
@@ -84,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
- nf_ct_helper_log(skb, ct, "all ports in use");
+ nf_ct_helper_log(skb, exp->master, "all ports in use");
return NF_DROP;
}
@@ -124,6 +113,7 @@ out:
static void __exit nf_nat_ftp_fini(void)
{
+ nf_nat_helper_unregister(&nat_helper_ftp);
RCU_INIT_POINTER(nf_nat_ftp_hook, NULL);
synchronize_rcu();
}
@@ -131,6 +121,7 @@ static void __exit nf_nat_ftp_fini(void)
static int __init nf_nat_ftp_init(void)
{
BUG_ON(nf_nat_ftp_hook != NULL);
+ nf_nat_helper_register(&nat_helper_ftp);
RCU_INIT_POINTER(nf_nat_ftp_hook, nf_nat_ftp);
return 0;
}
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 38793b95d9bc..bf591e6af005 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* nf_nat_helper.c - generic support functions for NAT helpers
*
* (C) 2000-2002 Harald Welte <laforge@netfilter.org>
* (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2007-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/gfp.h>
@@ -22,9 +19,6 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
/* Frobs data inside this packet, which is linear. */
@@ -98,11 +92,10 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len, bool adjust)
{
- const struct nf_nat_l3proto *l3proto;
struct tcphdr *tcph;
int oldlen, datalen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
@@ -118,9 +111,8 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
datalen = skb->len - protoff;
- l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
- l3proto->csum_recalc(skb, IPPROTO_TCP, tcph, &tcph->check,
- datalen, oldlen);
+ nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_TCP,
+ tcph, &tcph->check, datalen, oldlen);
if (adjust && rep_len != match_len)
nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
@@ -150,11 +142,10 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
const char *rep_buffer,
unsigned int rep_len)
{
- const struct nf_nat_l3proto *l3proto;
struct udphdr *udph;
int datalen, oldlen;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return false;
if (rep_len > match_len &&
@@ -176,9 +167,8 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
return true;
- l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
- l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check,
- datalen, oldlen);
+ nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_UDP,
+ udph, &udph->check, datalen, oldlen);
return true;
}
@@ -208,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct,
nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
EXPORT_SYMBOL(nf_nat_follow_master);
+
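+/* Try to reserve @port for @exp; on -EBUSY (port already taken),
+ * retry with random ports in [port, USHRT_MAX), at most 128 times.
+ * Returns the reserved port, or 0 if none was available.
+ */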
+u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port)
+{
+ static const unsigned int max_attempts = 128;
+ int range, attempts_left;
+ u16 min = port;
+
+ range = USHRT_MAX - port;
+ attempts_left = range;
+
+ if (attempts_left > max_attempts)
+ attempts_left = max_attempts;
+
+ /* Try to get same port: if not, try to change it. */
+ for (;;) {
+ int res;
+
+ exp->tuple.dst.u.tcp.port = htons(port);
+ res = nf_ct_expect_related(exp, 0);
+ if (res == 0)
+ return port;
+
+ if (res != -EBUSY || (--attempts_left < 0))
+ break;
+
+ port = min + get_random_u32_below(range);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_nat_exp_find_port);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index 3aa35a43100d..19c4fcc60c50 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* IRC extension for TCP NAT alteration.
*
* (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
* (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
* based on a copy of RR's ip_nat_ftp.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -23,10 +19,15 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <linux/netfilter/nf_conntrack_irc.h>
+#define NAT_HELPER_NAME "irc"
+
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("IRC (DCC) NAT helper");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_irc");
+MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME);
+
+static struct nf_conntrack_nat_helper nat_helper_irc =
+ NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME);
static unsigned int help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
@@ -47,20 +48,8 @@ static unsigned int help(struct sk_buff *skb,
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
- for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp,
+ ntohs(exp->saved_proto.tcp.port));
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use");
return NF_DROP;
@@ -96,6 +85,7 @@ static unsigned int help(struct sk_buff *skb,
static void __exit nf_nat_irc_fini(void)
{
+ nf_nat_helper_unregister(&nat_helper_irc);
RCU_INIT_POINTER(nf_nat_irc_hook, NULL);
synchronize_rcu();
}
@@ -103,6 +93,7 @@ static void __exit nf_nat_irc_fini(void)
static int __init nf_nat_irc_init(void)
{
BUG_ON(nf_nat_irc_hook != NULL);
+ nf_nat_helper_register(&nat_helper_irc);
RCU_INIT_POINTER(nf_nat_irc_hook, help);
return 0;
}
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
new file mode 100644
index 000000000000..1a506b0c6511
--- /dev/null
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/inetdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/netfilter/nf_nat_masquerade.h>
+
+struct masq_dev_work {
+ struct work_struct work;
+ struct net *net;
+ netns_tracker ns_tracker;
+ union nf_inet_addr addr;
+ int ifindex;
+ int (*iter)(struct nf_conn *i, void *data);
+};
+
+#define MAX_MASQ_WORKER_COUNT 16
+
+static DEFINE_MUTEX(masq_mutex);
+static unsigned int masq_refcnt __read_mostly;
+static atomic_t masq_worker_count __read_mostly;
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+ const struct nf_nat_range2 *range,
+ const struct net_device *out)
+{
+ struct nf_conn *ct;
+ struct nf_conn_nat *nat;
+ enum ip_conntrack_info ctinfo;
+ struct nf_nat_range2 newrange;
+ const struct rtable *rt;
+ __be32 newsrc, nh;
+
+ WARN_ON(hooknum != NF_INET_POST_ROUTING);
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY)));
+
+ /* Source address is 0.0.0.0 - locally generated packet that is
+ * probably not supposed to be masqueraded.
+ */
+ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
+ return NF_ACCEPT;
+
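+ /* Use the address the routing stack would pick as source on the
+ * outgoing device towards the next hop.
+ */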
+ rt = skb_rtable(skb);
+ nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
+ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
+ if (!newsrc) {
+ pr_info("%s ate my IP address\n", out->name);
+ return NF_DROP;
+ }
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat)
+ nat->masq_index = out->ifindex;
+
+ /* Transfer from original range. */
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.ip = newsrc;
+ newrange.max_addr.ip = newsrc;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ /* Hand modified range to generic setup. */
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
+
+static void iterate_cleanup_work(struct work_struct *work)
+{
+ struct nf_ct_iter_data iter_data = {};
+ struct masq_dev_work *w;
+
+ w = container_of(work, struct masq_dev_work, work);
+
+ iter_data.net = w->net;
+ iter_data.data = (void *)w;
+ nf_ct_iterate_cleanup_net(w->iter, &iter_data);
+
+ put_net_track(w->net, &w->ns_tracker);
+ kfree(w);
+ atomic_dec(&masq_worker_count);
+ module_put(THIS_MODULE);
+}
+
+/* Iterate conntrack table in the background and remove conntrack entries
+ * that use the device/address being removed.
+ *
+ * In case too many work items have been queued already or memory allocation
+ * fails, the iteration is skipped; conntrack entries will time out eventually.
+ */
+static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
+ int ifindex,
+ int (*iter)(struct nf_conn *i, void *data),
+ gfp_t gfp_flags)
+{
+ struct masq_dev_work *w;
+
+ if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
+ return;
+
+ net = maybe_get_net(net);
+ if (!net)
+ return;
+
+ if (!try_module_get(THIS_MODULE))
+ goto err_module;
+
+ w = kzalloc(sizeof(*w), gfp_flags);
+ if (w) {
+ /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
+ atomic_inc(&masq_worker_count);
+
+ INIT_WORK(&w->work, iterate_cleanup_work);
+ w->ifindex = ifindex;
+ w->net = net;
+ netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
+ w->iter = iter;
+ if (addr)
+ w->addr = *addr;
+ schedule_work(&w->work);
+ return;
+ }
+
+ module_put(THIS_MODULE);
+ err_module:
+ put_net(net);
+}
+
+static int device_cmp(struct nf_conn *i, void *arg)
+{
+ const struct nf_conn_nat *nat = nfct_nat(i);
+ const struct masq_dev_work *w = arg;
+
+ if (!nat)
+ return 0;
+ return nat->masq_index == w->ifindex;
+}
+
+static int masq_device_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN) {
+ /* Device was downed. Search entire table for
+ * conntracks which were associated with that device,
+ * and forget them.
+ */
+
+ nf_nat_masq_schedule(net, NULL, dev->ifindex,
+ device_cmp, GFP_KERNEL);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int inet_cmp(struct nf_conn *ct, void *ptr)
+{
+ struct nf_conntrack_tuple *tuple;
+ struct masq_dev_work *w = ptr;
+
+ if (!device_cmp(ct, ptr))
+ return 0;
+
+ tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
+}
+
+static int masq_inet_event(struct notifier_block *this,
+ unsigned long event,
+ void *ptr)
+{
+ const struct in_ifaddr *ifa = ptr;
+ const struct in_device *idev;
+ const struct net_device *dev;
+ union nf_inet_addr addr;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ /* The masq_dev_notifier will catch the case of the device going
+ * down. So if the inetdev is dead and being destroyed we have
+ * no work to do. Otherwise this is an individual address removal
+ * and we have to perform the flush.
+ */
+ idev = ifa->ifa_dev;
+ if (idev->dead)
+ return NOTIFY_DONE;
+
+ memset(&addr, 0, sizeof(addr));
+
+ addr.ip = ifa->ifa_address;
+
+ dev = idev->dev;
+ nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
+ inet_cmp, GFP_KERNEL);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_dev_notifier = {
+ .notifier_call = masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
+ const struct in6_addr *daddr, unsigned int srcprefs,
+ struct in6_addr *saddr)
+{
+#ifdef CONFIG_IPV6_MODULE
+ const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+ if (!v6_ops)
+ return -EHOSTUNREACH;
+
+ return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr);
+#else
+ return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr);
+#endif
+}
+
+unsigned int
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+ const struct net_device *out)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_nat *nat;
+ struct in6_addr src;
+ struct nf_conn *ct;
+ struct nf_nat_range2 newrange;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED_REPLY)));
+
+ if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out,
+ &ipv6_hdr(skb)->daddr, 0, &src) < 0)
+ return NF_DROP;
+
+ nat = nf_ct_nat_ext_add(ct);
+ if (nat)
+ nat->masq_index = out->ifindex;
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr.in6 = src;
+ newrange.max_addr.in6 = src;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
+
+/* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
+ *
+ * Defer it to the system workqueue.
+ *
+ * As we can have many inet events (depending on the number of IPv6
+ * addresses being deleted), we also need to cap the work item queue.
+ */
+static int masq_inet6_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = ptr;
+ const struct net_device *dev;
+ union nf_inet_addr addr;
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ dev = ifa->idev->dev;
+
+ memset(&addr, 0, sizeof(addr));
+
+ addr.in6 = ifa->addr;
+
+ nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
+ GFP_ATOMIC);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_inet6_notifier = {
+ .notifier_call = masq_inet6_event,
+};
+
+static int nf_nat_masquerade_ipv6_register_notifier(void)
+{
+ return register_inet6addr_notifier(&masq_inet6_notifier);
+}
+#else
+static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; }
+#endif
+
+int nf_nat_masquerade_inet_register_notifiers(void)
+{
+ int ret = 0;
+
+ mutex_lock(&masq_mutex);
+ if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) {
+ ret = -EOVERFLOW;
+ goto out_unlock;
+ }
+
+ /* check if the notifier was already set */
+ if (++masq_refcnt > 1)
+ goto out_unlock;
+
+ /* Register for device down reports */
+ ret = register_netdevice_notifier(&masq_dev_notifier);
+ if (ret)
+ goto err_dec;
+ /* Register IP address change reports */
+ ret = register_inetaddr_notifier(&masq_inet_notifier);
+ if (ret)
+ goto err_unregister;
+
+ ret = nf_nat_masquerade_ipv6_register_notifier();
+ if (ret)
+ goto err_unreg_inet;
+
+ mutex_unlock(&masq_mutex);
+ return ret;
+err_unreg_inet:
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+err_unregister:
+ unregister_netdevice_notifier(&masq_dev_notifier);
+err_dec:
+ masq_refcnt--;
+out_unlock:
+ mutex_unlock(&masq_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers);
+
+void nf_nat_masquerade_inet_unregister_notifiers(void)
+{
+ mutex_lock(&masq_mutex);
+ /* check if the notifiers still have clients */
+ if (--masq_refcnt > 0)
+ goto out_unlock;
+
+ unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&masq_inet6_notifier);
+#endif
+out_unlock:
+ mutex_unlock(&masq_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);
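
A hedged sketch of the intended register/unregister pairing from a masquerade target module's init/exit paths; the module and function names here are hypothetical:

/* Hedged sketch: illustrative module init/exit, not part of this patch. */
static int __init example_masq_init(void)
{
	int ret;

	/* Take a shared reference on the device/address notifiers so
	 * stale masquerade conntracks are flushed on NETDEV_DOWN.
	 */
	ret = nf_nat_masquerade_inet_register_notifiers();
	if (ret)
		return ret;

	/* ... register the target itself here ... */
	return 0;
}

static void __exit example_masq_exit(void)
{
	/* ... unregister the target first ... */
	nf_nat_masquerade_inet_unregister_notifiers();
}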
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
new file mode 100644
index 000000000000..0f9a559f6207
--- /dev/null
+++ b/net/netfilter/nf_nat_ovs.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* NAT support functions for openvswitch, used by both OVS and TC conntrack. */
+
+#include <net/netfilter/nf_nat.h>
+
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range,
+ enum nf_nat_manip_type maniptype)
+{
+ __be16 proto = skb_protocol(skb, true);
+ int hooknum, err = NF_ACCEPT;
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (proto == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto out;
+ } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto out;
+ }
+ }
+ /* Non-ICMP, fall through to initialize if needed. */
+ fallthrough;
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto out;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto out;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+out:
+ if (err == NF_ACCEPT)
+ *action |= BIT(maniptype);
+
+ return err;
+}
+
+int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, int *action,
+ const struct nf_nat_range2 *range, bool commit)
+{
+ enum nf_nat_manip_type maniptype;
+ int err, ct_action = *action;
+
+ *action = 0;
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_DROP; /* Can't NAT. */
+
+ if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
+ (ctinfo != IP_CT_RELATED || commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (ct_action & BIT(NF_NAT_MANIP_DST)) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT;
+ }
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype);
+ if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
+ if (ct->status & IPS_SRC_NAT) {
+ if (maniptype == NF_NAT_MANIP_SRC)
+ maniptype = NF_NAT_MANIP_DST;
+ else
+ maniptype = NF_NAT_MANIP_SRC;
+
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, range,
+ maniptype);
+ } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL,
+ NF_NAT_MANIP_SRC);
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_nat);
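
To show the in/out semantics of the action bitmask, a minimal sketch of how an OVS/TC-style conntrack action might drive nf_ct_nat(); the surrounding action plumbing is an assumption, only nf_ct_nat() itself comes from this file:

/* Hedged sketch: illustrative caller, not taken from this patch. */
static int ct_nat_example(struct sk_buff *skb, struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct nf_nat_range2 *range, bool commit)
{
	/* Request SNAT on input; nf_ct_nat() rewrites *action to a
	 * bitmask of the manipulations it actually performed.
	 */
	int action = BIT(NF_NAT_MANIP_SRC);
	int err;

	err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
	if (err != NF_ACCEPT)
		return err;

	if (action & BIT(NF_NAT_MANIP_SRC))
		pr_debug("source address/port was mangled\n");
	return NF_ACCEPT;
}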
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index f83bf9d8c9f5..b14a434b9561 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -20,13 +17,26 @@
#include <linux/netfilter.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
+
+#include <linux/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/checksum.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/xfrm.h>
+#include <net/ipv6.h>
+
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+
+static void nf_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype);
static void
__udp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, struct udphdr *hdr,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype, bool do_csum)
@@ -43,8 +53,7 @@ __udp_manip_pkt(struct sk_buff *skb,
portptr = &hdr->dest;
}
if (do_csum) {
- l3proto->csum_update(skb, iphdroff, &hdr->check,
- tuple, maniptype);
+ nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport,
false);
if (!hdr->check)
@@ -54,26 +63,22 @@ __udp_manip_pkt(struct sk_buff *skb,
}
static bool udp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
struct udphdr *hdr;
- bool do_csum;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
- do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL;
+ __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check);
- __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, do_csum);
return true;
}
static bool udplite_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
@@ -81,18 +86,17 @@ static bool udplite_manip_pkt(struct sk_buff *skb,
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
struct udphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct udphdr *)(skb->data + hdroff);
- __udp_manip_pkt(skb, l3proto, iphdroff, hdr, tuple, maniptype, true);
+ __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true);
#endif
return true;
}
static bool
sctp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
@@ -108,7 +112,7 @@ sctp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(*hdr))
hdrsize = sizeof(*hdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct sctphdr *)(skb->data + hdroff);
@@ -135,7 +139,6 @@ sctp_manip_pkt(struct sk_buff *skb,
static bool
tcp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
@@ -150,7 +153,7 @@ tcp_manip_pkt(struct sk_buff *skb,
if (skb->len >= hdroff + sizeof(struct tcphdr))
hdrsize = sizeof(struct tcphdr);
- if (!skb_make_writable(skb, hdroff + hdrsize))
+ if (skb_ensure_writable(skb, hdroff + hdrsize))
return false;
hdr = (struct tcphdr *)(skb->data + hdroff);
@@ -171,66 +174,36 @@ tcp_manip_pkt(struct sk_buff *skb,
if (hdrsize < sizeof(*hdr))
return true;
- l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
+ nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype);
inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false);
return true;
}
static bool
-dccp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct dccp_hdr *hdr;
- __be16 *portptr, oldport, newport;
- int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
- if (skb->len >= hdroff + sizeof(struct dccp_hdr))
- hdrsize = sizeof(struct dccp_hdr);
-
- if (!skb_make_writable(skb, hdroff + hdrsize))
- return false;
-
- hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- newport = tuple->src.u.dccp.port;
- portptr = &hdr->dccph_sport;
- } else {
- newport = tuple->dst.u.dccp.port;
- portptr = &hdr->dccph_dport;
- }
-
- oldport = *portptr;
- *portptr = newport;
-
- if (hdrsize < sizeof(*hdr))
- return true;
-
- l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum,
- tuple, maniptype);
- inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
- false);
-#endif
- return true;
-}
-
-static bool
icmp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
struct icmphdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmphdr *)(skb->data + hdroff);
+ switch (hdr->type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMP_INFO_REQUEST:
+ case ICMP_INFO_REPLY:
+ case ICMP_ADDRESS:
+ case ICMP_ADDRESSREPLY:
+ break;
+ default:
+ return true;
+ }
inet_proto_csum_replace2(&hdr->checksum, skb,
hdr->un.echo.id, tuple->src.u.icmp.id, false);
hdr->un.echo.id = tuple->src.u.icmp.id;
@@ -239,19 +212,17 @@ icmp_manip_pkt(struct sk_buff *skb,
static bool
icmpv6_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
struct icmp6hdr *hdr;
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmp6hdr *)(skb->data + hdroff);
- l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
- tuple, maniptype);
+ nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype);
if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST ||
hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
@@ -265,7 +236,6 @@ icmpv6_manip_pkt(struct sk_buff *skb,
/* manipulate a GRE packet according to maniptype */
static bool
gre_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
@@ -276,7 +246,7 @@ gre_manip_pkt(struct sk_buff *skb,
/* pgreh includes two optional 32-bit fields which are not required
 * to be there. That's where the magic '8' comes from. */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
+ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8))
return false;
greh = (void *)skb->data + hdroff;
@@ -304,40 +274,846 @@ gre_manip_pkt(struct sk_buff *skb,
return true;
}
-bool nf_nat_l4proto_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
+static bool l4proto_manip_pkt(struct sk_buff *skb,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
switch (tuple->dst.protonum) {
case IPPROTO_TCP:
- return tcp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return tcp_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
case IPPROTO_UDP:
- return udp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return udp_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
case IPPROTO_UDPLITE:
- return udplite_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return udplite_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
case IPPROTO_SCTP:
- return sctp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return sctp_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
case IPPROTO_ICMP:
- return icmp_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return icmp_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
case IPPROTO_ICMPV6:
- return icmpv6_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return icmpv6_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
- case IPPROTO_DCCP:
- return dccp_manip_pkt(skb, l3proto, iphdroff, hdroff,
- tuple, maniptype);
case IPPROTO_GRE:
- return gre_manip_pkt(skb, l3proto, iphdroff, hdroff,
+ return gre_manip_pkt(skb, iphdroff, hdroff,
tuple, maniptype);
}
/* If we don't know protocol -- no error, pass it unmodified. */
return true;
}
-EXPORT_SYMBOL_GPL(nf_nat_l4proto_manip_pkt);
+
+static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph;
+ unsigned int hdroff;
+
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*iph)))
+ return false;
+
+ iph = (void *)skb->data + iphdroff;
+ hdroff = iphdroff + iph->ihl * 4;
+
+ if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype))
+ return false;
+ iph = (void *)skb->data + iphdroff;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
+ iph->saddr = target->src.u3.ip;
+ } else {
+ csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
+ iph->daddr = target->dst.u3.ip;
+ }
+ return true;
+}
+
+static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
+ unsigned int iphdroff,
+ const struct nf_conntrack_tuple *target,
+ enum nf_nat_manip_type maniptype)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *ipv6h;
+ __be16 frag_off;
+ int hdroff;
+ u8 nexthdr;
+
+ if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h)))
+ return false;
+
+ ipv6h = (void *)skb->data + iphdroff;
+ nexthdr = ipv6h->nexthdr;
+ hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h),
+ &nexthdr, &frag_off);
+ if (hdroff < 0)
+ goto manip_addr;
+
+ if ((frag_off & htons(~0x7)) == 0 &&
+ !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype))
+ return false;
+
+ /* must reload, offset might have changed */
+ ipv6h = (void *)skb->data + iphdroff;
+
+manip_addr:
+ if (maniptype == NF_NAT_MANIP_SRC)
+ ipv6h->saddr = target->src.u3.in6;
+ else
+ ipv6h->daddr = target->dst.u3.in6;
+
+#endif
+ return true;
+}
+
+unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
+ enum nf_nat_manip_type mtype,
+ enum ip_conntrack_dir dir)
+{
+ struct nf_conntrack_tuple target;
+
+ /* We are aiming to look like the inverse of the other direction. */
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+
+ switch (target.src.l3num) {
+ case NFPROTO_IPV6:
+ if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype))
+ return NF_ACCEPT;
+ break;
+ case NFPROTO_IPV4:
+ if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype))
+ return NF_ACCEPT;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return NF_DROP;
+}
+
+static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
+ __be32 oldip, newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = iph->saddr;
+ newip = t->src.u3.ip;
+ } else {
+ oldip = iph->daddr;
+ newip = t->dst.u3.ip;
+ }
+ inet_proto_csum_replace4(check, skb, oldip, newip, true);
+}
+
+static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff);
+ const struct in6_addr *oldip, *newip;
+
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ oldip = &ipv6h->saddr;
+ newip = &t->src.u3.in6;
+ } else {
+ oldip = &ipv6h->daddr;
+ newip = &t->dst.u3.in6;
+ }
+ inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
+ newip->s6_addr32, true);
+#endif
+}
+
+static void nf_csum_update(struct sk_buff *skb,
+ unsigned int iphdroff, __sum16 *check,
+ const struct nf_conntrack_tuple *t,
+ enum nf_nat_manip_type maniptype)
+{
+ switch (t->src.l3num) {
+ case NFPROTO_IPV4:
+ nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype);
+ return;
+ case NFPROTO_IPV6:
+ nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype);
+ return;
+ }
+}
+
+static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+ proto, 0);
+ } else {
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), true);
+ }
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
+ u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+ (data - (void *)skb->data);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto, 0);
+ } else {
+ inet_proto_csum_replace2(check, skb,
+ htons(oldlen), htons(datalen), true);
+ }
+}
+#endif
+
+void nf_nat_csum_recalc(struct sk_buff *skb,
+ u8 nfproto, u8 proto, void *data, __sum16 *check,
+ int datalen, int oldlen)
+{
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ nf_nat_ipv4_csum_recalc(skb, proto, data, check,
+ datalen, oldlen);
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ nf_nat_ipv6_csum_recalc(skb, proto, data, check,
+ datalen, oldlen);
+ return;
+#endif
+ }
+
+ WARN_ON_ONCE(1);
+}
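
The non-CHECKSUM_PARTIAL branches above rely on incremental checksum updates (RFC 1624, HC' = ~(~HC + ~m + m')) rather than a full recompute. A standalone userspace sketch of the arithmetic that inet_proto_csum_replace2() performs for a single 16-bit field (illustration only, not kernel code):

#include <stdint.h>

/* Incrementally update a one's-complement checksum when a 16-bit
 * field covered by it changes from 'old' to 'new' (RFC 1624, eqn 3).
 */
static uint16_t csum_replace16(uint16_t check, uint16_t old, uint16_t new)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old;
	sum += new;
	/* fold carries back into the low 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}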
+
+int nf_nat_icmp_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum)
+{
+ struct {
+ struct icmphdr icmp;
+ struct iphdr ip;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ unsigned int hdrlen = ip_hdrlen(skb);
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
+
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp.type == ICMP_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is the reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
+ &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Reloading "inside" here since manip_pkt may reallocate */
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp.checksum = 0;
+ inside->icmp.checksum =
+ csum_fold(skb_checksum(skb, hdrlen,
+ skb->len - hdrlen, 0));
+ }
+
+ /* Change outer to look like the reply to an incoming packet */
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+ target.dst.protonum = IPPROTO_ICMP;
+ if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
+
+static unsigned int
+nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ state->hook))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ }
+
+ return nf_nat_inet_fn(priv, skb, state);
+}
+
+static unsigned int
+nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ unsigned int ret;
+ __be32 daddr = ip_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv4_fn(priv, skb, state);
+ if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr)
+ skb_dst_drop(skb);
+
+ return ret;
+}
+
+#ifdef CONFIG_XFRM
+static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
+{
+ struct sock *sk = skb->sk;
+ struct dst_entry *dst;
+ unsigned int hh_len;
+ struct flowi fl;
+ int err;
+
+ err = xfrm_decode_session(net, skb, &fl, family);
+ if (err < 0)
+ return err;
+
+ dst = skb_dst(skb);
+ if (dst->xfrm)
+ dst = ((struct xfrm_dst *)dst)->route;
+ if (!dst_hold_safe(dst))
+ return -EHOSTUNREACH;
+
+ if (sk && !net_eq(net, sock_net(sk)))
+ sk = NULL;
+
+ dst = xfrm_lookup(net, dst, &fl, sk, 0);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ /* Change in oif may mean change in hh_len. */
+ hh_len = skb_dst(skb)->dev->hard_header_len;
+ if (skb_headroom(skb) < hh_len &&
+ pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
+ return -ENOMEM;
+ return 0;
+}
+#endif
+
+static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport)
+{
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ const struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ return false;
+ }
+
+ dir = CTINFO2DIR(ctinfo);
+ if (dir != IP_CT_DIR_ORIGINAL)
+ return false;
+
+ return ct->tuplehash[!dir].tuple.dst.u.all != sport;
+}
+
+static unsigned int
+nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ __be32 saddr = ip_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv4_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* skb has a socket assigned via tcp edemux. We need to check
+ * whether nf_nat_ipv4_fn() has mangled the packet in a way that
+ * would have prevented edemux from finding this socket.
+ *
+ * This includes both changes to the source address and changes
+ * to the source port, which are both handled by the
+ * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux
+ * might have found a socket for the old (pre-snat) address.
+ */
+ if (saddr != ip_hdr(skb)->saddr ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb); /* TCP edemux obtained wrong socket */
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ ret = nf_nat_ipv4_fn(priv, skb, state);
+#ifdef CONFIG_XFRM
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
+ return ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ ret = nf_nat_ipv4_fn(priv, skb, state);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (ct->tuplehash[dir].tuple.dst.u3.ip !=
+ ct->tuplehash[!dir].tuple.src.u3.ip) {
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+ return ret;
+}
+
+static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_pre_routing,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_out,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv4_local_fn,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv4_local_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC,
+ },
+};
+
+int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops,
+ ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn);
+
+void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn);
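
A hedged sketch of how an iptable_nat-style user might attach one of its own hook functions through nf_nat_ipv4_register_fn(); the hook body, names, and priority choice are illustrative only:

/* Hedged sketch: illustrative registration, not part of this patch. */
static unsigned int example_nat_hook(void *priv, struct sk_buff *skb,
				     const struct nf_hook_state *state)
{
	/* A real implementation would evaluate NAT rules and call
	 * nf_nat_setup_info() for new connections.
	 */
	return NF_ACCEPT;
}

static const struct nf_hook_ops example_nat_op = {
	.hook     = example_nat_hook,
	.pf       = NFPROTO_IPV4,
	.hooknum  = NF_INET_PRE_ROUTING,
	.priority = NF_IP_PRI_NAT_DST,
};

static int __net_init example_nat_net_init(struct net *net)
{
	return nf_nat_ipv4_register_fn(net, &example_nat_op);
}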
+
+#if IS_ENABLED(CONFIG_IPV6)
+int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int hooknum,
+ unsigned int hdrlen)
+{
+ struct {
+ struct icmp6hdr icmp6;
+ struct ipv6hdr ip6;
+ } *inside;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
+ struct nf_conntrack_tuple target;
+ unsigned long statusbit;
+
+ WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
+
+ if (skb_ensure_writable(skb, hdrlen + sizeof(*inside)))
+ return 0;
+ if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
+ return 0;
+
+ inside = (void *)skb->data + hdrlen;
+ if (inside->icmp6.icmp6_type == NDISC_REDIRECT) {
+ if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+ return 0;
+ if (ct->status & IPS_NAT_MASK)
+ return 0;
+ }
+
+ if (manip == NF_NAT_MANIP_SRC)
+ statusbit = IPS_SRC_NAT;
+ else
+ statusbit = IPS_DST_NAT;
+
+ /* Invert if this is the reply direction */
+ if (dir == IP_CT_DIR_REPLY)
+ statusbit ^= IPS_NAT_MASK;
+
+ if (!(ct->status & statusbit))
+ return 1;
+
+ if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
+ &ct->tuplehash[!dir].tuple, !manip))
+ return 0;
+
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+ inside = (void *)skb->data + hdrlen;
+ inside->icmp6.icmp6_cksum = 0;
+ inside->icmp6.icmp6_cksum =
+ csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ skb->len - hdrlen, IPPROTO_ICMPV6,
+ skb_checksum(skb, hdrlen,
+ skb->len - hdrlen, 0));
+ }
+
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+ target.dst.protonum = IPPROTO_ICMPV6;
+ if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
+
+static unsigned int
+nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ __be16 frag_off;
+ int hdrlen;
+ u8 nexthdr;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ /* Can't track? It's not due to stress, or conntrack would
+ * have dropped it. Hence it's the user's responsibility to
+ * packet filter it out, or implement conntrack/NAT for that
+ * protocol. 8) --RR
+ */
+ if (!ct)
+ return NF_ACCEPT;
+
+ if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+ state->hook,
+ hdrlen))
+ return NF_DROP;
+ else
+ return NF_ACCEPT;
+ }
+ }
+
+ return nf_nat_inet_fn(priv, skb, state);
+}
+
+static unsigned int
+nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr = ipv6_hdr(skb)->saddr;
+ struct sock *sk = skb->sk;
+ unsigned int ret;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+
+ if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk))
+ return ret;
+
+ /* see nf_nat_ipv4_local_in */
+ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) ||
+ nf_nat_inet_port_was_mangled(skb, sk->sk_dport))
+ skb_orphan(skb);
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ unsigned int ret, verdict;
+ struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+ verdict = ret & NF_VERDICT_MASK;
+ if (verdict != NF_DROP && verdict != NF_STOLEN &&
+ ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+ skb_dst_drop(skb);
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#ifdef CONFIG_XFRM
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ int err;
+#endif
+ unsigned int ret;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+#ifdef CONFIG_XFRM
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
+ return ret;
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3) ||
+ (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
+ ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)) {
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+#endif
+
+ return ret;
+}
+
+static unsigned int
+nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int ret;
+ int err;
+
+ ret = nf_nat_ipv6_fn(priv, skb, state);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+ if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+ &ct->tuplehash[!dir].tuple.src.u3)) {
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#ifdef CONFIG_XFRM
+ else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+ ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
+ ct->tuplehash[dir].tuple.dst.u.all !=
+ ct->tuplehash[!dir].tuple.src.u.all) {
+ err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+#endif
+ }
+
+ return ret;
+}
+
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_out,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+ /* Before packet filtering, change destination */
+ {
+ .hook = nf_nat_ipv6_local_fn,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST,
+ },
+ /* After packet filtering, change source */
+ {
+ .hook = nf_nat_ipv6_local_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC,
+ },
+};
+
+int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops,
+ ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn);
+
+void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn);
+#endif /* CONFIG_IPV6 */
+
+#if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT)
+int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(ops->pf != NFPROTO_INET))
+ return -EINVAL;
+
+ ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops,
+ ARRAY_SIZE(nf_nat_ipv6_ops));
+ if (ret)
+ return ret;
+
+ ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops,
+ ARRAY_SIZE(nf_nat_ipv4_ops));
+ if (ret)
+ nf_nat_unregister_fn(net, NFPROTO_IPV6, ops,
+ ARRAY_SIZE(nf_nat_ipv6_ops));
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn);
+
+void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
+ nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
+}
+EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn);
+#endif /* NFT INET NAT */
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 78a9e6454ff3..5b37487d9d11 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -1,18 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6
* NAT funded by Astaro.
*/
#include <linux/if.h>
#include <linux/inetdevice.h>
+#include <linux/in.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
@@ -27,79 +25,104 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_redirect.h>
+static unsigned int
+nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
+ const union nf_inet_addr *newdst)
+{
+ struct nf_nat_range2 newrange;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+ memset(&newrange, 0, sizeof(newrange));
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
+ newrange.min_addr = *newdst;
+ newrange.max_addr = *newdst;
+ newrange.min_proto = range->min_proto;
+ newrange.max_proto = range->max_proto;
+
+ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
- const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be32 newdst;
- struct nf_nat_range2 newrange;
+ union nf_inet_addr newdst = {};
WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
hooknum != NF_INET_LOCAL_OUT);
- ct = nf_ct_get(skb, &ctinfo);
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
-
/* Local packets: make them go to loopback */
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = htonl(0x7F000001);
+ newdst.ip = htonl(INADDR_LOOPBACK);
} else {
- struct in_device *indev;
- struct in_ifaddr *ifa;
-
- newdst = 0;
+ const struct in_device *indev;
indev = __in_dev_get_rcu(skb->dev);
- if (indev && indev->ifa_list) {
- ifa = indev->ifa_list;
- newdst = ifa->ifa_local;
+ if (indev) {
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(indev->ifa_list);
+ if (ifa)
+ newdst.ip = ifa->ifa_local;
}
- if (!newdst)
+ if (!newdst.ip)
return NF_DROP;
}
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newdst;
- newrange.max_addr.ip = newdst;
- newrange.min_proto = mr->range[0].min;
- newrange.max_proto = mr->range[0].max;
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
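
With the new signature taking an nf_nat_range2, a caller carrying the legacy multi-range targinfo has to convert it first. A hedged sketch of an xt_REDIRECT-style wrapper doing that conversion (the wrapper name is an assumption):

/* Hedged sketch: illustrative caller, not taken from this patch. */
static unsigned int
redirect_tg4_example(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
	struct nf_nat_range2 range = {
		.flags     = mr->range[0].flags,
		.min_proto = mr->range[0].min,
		.max_proto = mr->range[0].max,
	};

	return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
}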
static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
+static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope)
+{
+ unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr);
+
+ if (ifa_addr_type & IPV6_ADDR_MAPPED)
+ return false;
+
+ if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC)))
+ return false;
+
+ if (scope) {
+ unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK;
+
+ if (!(scope & ifa_scope))
+ return false;
+ }
+
+ return true;
+}
+
unsigned int
nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
unsigned int hooknum)
{
- struct nf_nat_range2 newrange;
- struct in6_addr newdst;
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
+ union nf_inet_addr newdst = {};
- ct = nf_ct_get(skb, &ctinfo);
if (hooknum == NF_INET_LOCAL_OUT) {
- newdst = loopback_addr;
+ newdst.in6 = loopback_addr;
} else {
+ unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
bool addr = false;
idev = __in6_dev_get(skb->dev);
if (idev != NULL) {
+ const struct inet6_ifaddr *ifa;
+
read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
- newdst = ifa->addr;
+ if (!nf_nat_redirect_ipv6_usable(ifa, scope))
+ continue;
+
+ newdst.in6 = ifa->addr;
addr = true;
break;
}
@@ -110,12 +133,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
return NF_DROP;
}
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.in6 = newdst;
- newrange.max_addr.in6 = newdst;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+ return nf_nat_redirect(skb, range, &newdst);
}
EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index aa1be643d7a0..cf4aeb299bde 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* SIP extension for NAT alteration.
*
* (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
* based on RR's ip_nat_ftp.c and other modules.
* (C) 2007 United Security Providers
* (C) 2007, 2008, 2011, 2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -24,11 +21,15 @@
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <linux/netfilter/nf_conntrack_sip.h>
+#define NAT_HELPER_NAME "sip"
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>");
MODULE_DESCRIPTION("SIP NAT helper");
-MODULE_ALIAS("ip_nat_sip");
+MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME);
+static struct nf_conntrack_nat_helper nat_helper_sip =
+ NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME);
static unsigned int mangle_packet(struct sk_buff *skb, unsigned int protoff,
unsigned int dataoff,
@@ -281,7 +282,7 @@ next:
if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) {
struct udphdr *uh;
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
@@ -409,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
exp->dir = !dir;
exp->expectfn = nf_nat_sip_expected;
- for (; port != 0; port++) {
- int ret;
-
- exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- port = 0;
- break;
- }
- }
-
+ port = nf_nat_exp_find_port(exp, port);
if (port == 0) {
nf_ct_helper_log(skb, ct, "all ports in use for SIP");
return NF_DROP;
@@ -606,7 +595,8 @@ static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
int ret;
rtp_exp->tuple.dst.u.udp.port = htons(port);
- ret = nf_ct_expect_related(rtp_exp);
+ ret = nf_ct_expect_related(rtp_exp,
+ NF_CT_EXP_F_SKIP_MASTER);
if (ret == -EBUSY)
continue;
else if (ret < 0) {
@@ -614,7 +604,8 @@ static unsigned int nf_nat_sdp_media(struct sk_buff *skb, unsigned int protoff,
break;
}
rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
- ret = nf_ct_expect_related(rtcp_exp);
+ ret = nf_ct_expect_related(rtcp_exp,
+ NF_CT_EXP_F_SKIP_MASTER);
if (ret == 0)
break;
else if (ret == -EBUSY) {
@@ -656,8 +647,8 @@ static struct nf_ct_helper_expectfn sip_nat = {
static void __exit nf_nat_sip_fini(void)
{
+ nf_nat_helper_unregister(&nat_helper_sip);
RCU_INIT_POINTER(nf_nat_sip_hooks, NULL);
-
nf_ct_helper_expectfn_unregister(&sip_nat);
synchronize_rcu();
}
@@ -675,6 +666,7 @@ static const struct nf_nat_sip_hooks sip_hooks = {
static int __init nf_nat_sip_init(void)
{
BUG_ON(nf_nat_sip_hooks != NULL);
+ nf_nat_helper_register(&nat_helper_sip);
RCU_INIT_POINTER(nf_nat_sip_hooks, &sip_hooks);
nf_ct_helper_expectfn_register(&sip_nat);
return 0;
diff --git a/net/netfilter/nf_nat_tftp.c b/net/netfilter/nf_nat_tftp.c
index 7f67e1d5310d..1a591132d6eb 100644
--- a/net/netfilter/nf_nat_tftp.c
+++ b/net/netfilter/nf_nat_tftp.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -13,10 +10,15 @@
#include <net/netfilter/nf_nat_helper.h>
#include <linux/netfilter/nf_conntrack_tftp.h>
+#define NAT_HELPER_NAME "tftp"
+
MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
MODULE_DESCRIPTION("TFTP NAT helper");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_tftp");
+MODULE_ALIAS_NF_NAT_HELPER(NAT_HELPER_NAME);
+
+static struct nf_conntrack_nat_helper nat_helper_tftp =
+ NF_CT_NAT_HELPER_INIT(NAT_HELPER_NAME);
static unsigned int help(struct sk_buff *skb,
enum ip_conntrack_info ctinfo,
@@ -28,7 +30,7 @@ static unsigned int help(struct sk_buff *skb,
= ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
exp->dir = IP_CT_DIR_REPLY;
exp->expectfn = nf_nat_follow_master;
- if (nf_ct_expect_related(exp) != 0) {
+ if (nf_ct_expect_related(exp, 0) != 0) {
nf_ct_helper_log(skb, exp->master, "cannot add expectation");
return NF_DROP;
}
@@ -37,6 +39,7 @@ static unsigned int help(struct sk_buff *skb,
static void __exit nf_nat_tftp_fini(void)
{
+ nf_nat_helper_unregister(&nat_helper_tftp);
RCU_INIT_POINTER(nf_nat_tftp_hook, NULL);
synchronize_rcu();
}
@@ -44,6 +47,7 @@ static void __exit nf_nat_tftp_fini(void)
static int __init nf_nat_tftp_init(void)
{
BUG_ON(nf_nat_tftp_hook != NULL);
+ nf_nat_helper_register(&nat_helper_tftp);
RCU_INIT_POINTER(nf_nat_tftp_hook, help);
return 0;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index a36a77bae1d6..7f12e56e6e52 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -21,6 +21,8 @@
#include "nf_internals.h"
+static const struct nf_queue_handler __rcu *nf_queue_handler;
+
/*
* Hook for nfnetlink_queue to register its queue handler.
* We do this so that most of the NFQUEUE code can be modular.
@@ -29,88 +31,84 @@
* receives, no matter what.
*/
-/* return EBUSY when somebody else is registered, return EEXIST if the
- * same handler is registered, return 0 in case of success. */
-void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh)
+void nf_register_queue_handler(const struct nf_queue_handler *qh)
{
/* should never happen, we only have one queueing backend in kernel */
- WARN_ON(rcu_access_pointer(net->nf.queue_handler));
- rcu_assign_pointer(net->nf.queue_handler, qh);
+ WARN_ON(rcu_access_pointer(nf_queue_handler));
+ rcu_assign_pointer(nf_queue_handler, qh);
}
EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
-void nf_unregister_queue_handler(struct net *net)
+void nf_unregister_queue_handler(void)
{
- RCU_INIT_POINTER(net->nf.queue_handler, NULL);
+ RCU_INIT_POINTER(nf_queue_handler, NULL);
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb)
+static void nf_queue_sock_put(struct sock *sk)
{
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
-
- if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_put(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_put(physdev);
- }
+#ifdef CONFIG_INET
+ sock_gen_put(sk);
+#else
+ sock_put(sk);
#endif
}
-void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
/* Release those devices we held, or Alexey will kill me. */
- if (state->in)
- dev_put(state->in);
- if (state->out)
- dev_put(state->out);
+ dev_put(state->in);
+ dev_put(state->out);
if (state->sk)
- sock_put(state->sk);
+ nf_queue_sock_put(state->sk);
- nf_queue_entry_release_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ dev_put(entry->physin);
+ dev_put(entry->physout);
+#endif
}
-EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
-static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb)
+void nf_queue_entry_free(struct nf_queue_entry *entry)
{
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_free);
- if (nf_bridge) {
- struct net_device *physdev;
+static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ const struct sk_buff *skb = entry->skb;
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_hold(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_hold(physdev);
+ if (nf_bridge_info_exists(skb)) {
+ entry->physin = nf_bridge_get_physindev(skb, entry->state.net);
+ entry->physout = nf_bridge_get_physoutdev(skb);
+ } else {
+ entry->physin = NULL;
+ entry->physout = NULL;
}
#endif
}
/* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
- if (state->in)
- dev_hold(state->in);
- if (state->out)
- dev_hold(state->out);
- if (state->sk)
- sock_hold(state->sk);
+ if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+ return false;
- nf_queue_entry_get_br_nf_refs(entry->skb);
+ dev_hold(state->in);
+ dev_hold(state->out);
+
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ dev_hold(entry->physin);
+ dev_hold(entry->physout);
+#endif
+ return true;
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -119,7 +117,7 @@ void nf_queue_nf_hook_drop(struct net *net)
const struct nf_queue_handler *qh;
rcu_read_lock();
- qh = rcu_dereference(net->nf.queue_handler);
+ qh = rcu_dereference(nf_queue_handler);
if (qh)
qh->nf_hook_drop(net);
rcu_read_unlock();
@@ -156,21 +154,17 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
}
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
- const struct nf_hook_entries *entries,
unsigned int index, unsigned int queuenum)
{
- int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
const struct nf_queue_handler *qh;
- struct net *net = state->net;
unsigned int route_key_size;
+ int status;
/* QUEUE == DROP if no one is waiting, to be safe. */
- qh = rcu_dereference(net->nf.queue_handler);
- if (!qh) {
- status = -ESRCH;
- goto err;
- }
+ qh = rcu_dereference(nf_queue_handler);
+ if (!qh)
+ return -ESRCH;
switch (state->pf) {
case AF_INET:
@@ -184,10 +178,25 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
break;
}
+ if (skb_sk_is_prefetched(skb)) {
+ struct sock *sk = skb->sk;
+
+ if (!sk_is_refcounted(sk)) {
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ return -ENOTCONN;
+
+ /* drop refcount on skb_orphan */
+ skb->destructor = sock_edemux;
+ }
+ }
+
entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
- if (!entry) {
- status = -ENOMEM;
- goto err;
+ if (!entry)
+ return -ENOMEM;
+
+ if (skb_dst(skb) && !skb_dst_force(skb)) {
+ kfree(entry);
+ return -ENETDOWN;
}
*entry = (struct nf_queue_entry) {
@@ -197,8 +206,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
.size = sizeof(*entry) + route_key_size,
};
- nf_queue_entry_get_refs(entry);
- skb_dst_force(skb);
+ __nf_queue_entry_init_physdevs(entry);
+
+ if (!nf_queue_entry_get_refs(entry)) {
+ kfree(entry);
+ return -ENOTCONN;
+ }
switch (entry->state.pf) {
case AF_INET:
@@ -210,27 +223,21 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
status = qh->outfn(entry, queuenum);
-
if (status < 0) {
- nf_queue_entry_release_refs(entry);
- goto err;
+ nf_queue_entry_free(entry);
+ return status;
}
return 0;
-
-err:
- kfree(entry);
- return status;
}
/* Packets leaving via this function must come back through nf_reinject(). */
int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
- const struct nf_hook_entries *entries, unsigned int index,
- unsigned int verdict)
+ unsigned int index, unsigned int verdict)
{
int ret;
- ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS);
+ ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS);
if (ret < 0) {
if (ret == -ESRCH &&
(verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
@@ -240,110 +247,4 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
return 0;
}
-
-static unsigned int nf_iterate(struct sk_buff *skb,
- struct nf_hook_state *state,
- const struct nf_hook_entries *hooks,
- unsigned int *index)
-{
- const struct nf_hook_entry *hook;
- unsigned int verdict, i = *index;
-
- while (i < hooks->num_hook_entries) {
- hook = &hooks->hooks[i];
-repeat:
- verdict = nf_hook_entry_hookfn(hook, skb, state);
- if (verdict != NF_ACCEPT) {
- if (verdict != NF_REPEAT)
- return verdict;
- goto repeat;
- }
- i++;
- }
-
- *index = i;
- return NF_ACCEPT;
-}
-
-static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
-{
- switch (pf) {
-#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
- case NFPROTO_BRIDGE:
- return rcu_dereference(net->nf.hooks_bridge[hooknum]);
-#endif
- case NFPROTO_IPV4:
- return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
- case NFPROTO_IPV6:
- return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
- default:
- WARN_ON_ONCE(1);
- return NULL;
- }
-
- return NULL;
-}
-
-/* Caller must hold rcu read-side lock */
-void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
-{
- const struct nf_hook_entry *hook_entry;
- const struct nf_hook_entries *hooks;
- struct sk_buff *skb = entry->skb;
- const struct net *net;
- unsigned int i;
- int err;
- u8 pf;
-
- net = entry->state.net;
- pf = entry->state.pf;
-
- hooks = nf_hook_entries_head(net, pf, entry->state.hook);
-
- nf_queue_entry_release_refs(entry);
-
- i = entry->hook_index;
- if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
- kfree_skb(skb);
- kfree(entry);
- return;
- }
-
- hook_entry = &hooks->hooks[i];
-
- /* Continue traversal iff userspace said ok... */
- if (verdict == NF_REPEAT)
- verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
-
- if (verdict == NF_ACCEPT) {
- if (nf_reroute(skb, entry) < 0)
- verdict = NF_DROP;
- }
-
- if (verdict == NF_ACCEPT) {
-next_hook:
- ++i;
- verdict = nf_iterate(skb, &entry->state, hooks, &i);
- }
-
- switch (verdict & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_STOP:
- local_bh_disable();
- entry->state.okfn(entry->state.net, entry->state.sk, skb);
- local_bh_enable();
- break;
- case NF_QUEUE:
- err = nf_queue(skb, &entry->state, hooks, i, verdict);
- if (err == 1)
- goto next_hook;
- break;
- case NF_STOLEN:
- break;
- default:
- kfree_skb(skb);
- }
-
- kfree(entry);
-}
-EXPORT_SYMBOL(nf_reinject);
+EXPORT_SYMBOL_GPL(nf_queue);
diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index 46cb3786e0ec..34afcd03b6f6 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -89,78 +89,32 @@ out:
return ops;
}
-/* Call get/setsockopt() */
-static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
+int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, sockptr_t opt,
+ unsigned int len)
{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 0);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get)
- ret = ops->get(sk, val, opt, len);
- else
- ret = ops->set(sk, val, opt, *len);
-
+ ret = ops->set(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
- unsigned int len)
-{
- return nf_sockopt(sk, pf, val, opt, &len, 0);
-}
EXPORT_SYMBOL(nf_setsockopt);
int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt,
int *len)
{
- return nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(nf_getsockopt);
-
-#ifdef CONFIG_COMPAT
-static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
- char __user *opt, int *len, int get)
-{
struct nf_sockopt_ops *ops;
int ret;
- ops = nf_sockopt_find(sk, pf, val, get);
+ ops = nf_sockopt_find(sk, pf, val, 1);
if (IS_ERR(ops))
return PTR_ERR(ops);
-
- if (get) {
- if (ops->compat_get)
- ret = ops->compat_get(sk, val, opt, len);
- else
- ret = ops->get(sk, val, opt, len);
- } else {
- if (ops->compat_set)
- ret = ops->compat_set(sk, val, opt, *len);
- else
- ret = ops->set(sk, val, opt, *len);
- }
-
+ ret = ops->get(sk, val, opt, len);
module_put(ops->owner);
return ret;
}
-
-int compat_nf_setsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, unsigned int len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, &len, 0);
-}
-EXPORT_SYMBOL(compat_nf_setsockopt);
-
-int compat_nf_getsockopt(struct sock *sk, u_int8_t pf,
- int val, char __user *opt, int *len)
-{
- return compat_nf_sockopt(sk, pf, val, opt, len, 1);
-}
-EXPORT_SYMBOL(compat_nf_getsockopt);
-#endif
+EXPORT_SYMBOL(nf_getsockopt);
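
Folding the compat paths above works because nf_setsockopt() now receives a
sockptr_t, a tagged pointer that can carry either a kernel or a user buffer, so
one entry point serves native, compat and in-kernel callers alike. A simplified
rendering of the idea (the real type lives in include/linux/sockptr.h):

	typedef struct {
		union {
			void		*kernel;
			void __user	*user;
		};
		bool		is_kernel : 1;
	} sockptr_t;

	/* One copy helper replaces per-caller copy_from_user() branches. */
	static inline int copy_from_sockptr_sketch(void *dst, sockptr_t src,
						   size_t size)
	{
		if (src.is_kernel) {
			memcpy(dst, src.kernel, size);
			return 0;
		}
		return copy_from_user(dst, src.user, size) ? -EFAULT : 0;
	}
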
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8ff4d22f10b2..3fa3f5dfb264 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -1,28 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/tcp.h>
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter/xt_tcpudp.h>
-#include <linux/netfilter/xt_SYNPROXY.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_synproxy.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_synproxy.h>
unsigned int synproxy_net_id;
EXPORT_SYMBOL_GPL(synproxy_net_id);
@@ -34,6 +31,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
int length = (th->doff * 4) - sizeof(*th);
u8 buf[40], *ptr;
+ if (unlikely(length < 0))
+ return false;
+
ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
if (ptr == NULL)
return false;
@@ -50,6 +50,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
length--;
continue;
default:
+ if (length < 2)
+ return true;
opsize = *ptr++;
if (opsize < 2)
return true;
@@ -59,8 +61,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
switch (opcode) {
case TCPOPT_MSS:
if (opsize == TCPOLEN_MSS) {
- opts->mss = get_unaligned_be16(ptr);
- opts->options |= XT_SYNPROXY_OPT_MSS;
+ opts->mss_option = get_unaligned_be16(ptr);
+ opts->options |= NF_SYNPROXY_OPT_MSS;
}
break;
case TCPOPT_WINDOW:
@@ -68,19 +70,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
opts->wscale = *ptr;
if (opts->wscale > TCP_MAX_WSCALE)
opts->wscale = TCP_MAX_WSCALE;
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
}
break;
case TCPOPT_TIMESTAMP:
if (opsize == TCPOLEN_TIMESTAMP) {
opts->tsval = get_unaligned_be32(ptr);
opts->tsecr = get_unaligned_be32(ptr + 4);
- opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+ opts->options |= NF_SYNPROXY_OPT_TIMESTAMP;
}
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM)
- opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+ opts->options |= NF_SYNPROXY_OPT_SACK_PERM;
break;
}
@@ -92,36 +94,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
}
EXPORT_SYMBOL_GPL(synproxy_parse_options);
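
The two new guards in synproxy_parse_options() close truncation holes: a
negative option-block length, and a lone option-kind byte at the end of the
header with no length byte after it. The loop itself is the standard TCP option
TLV walk; a self-contained sketch of the same shape (this version rejects
partial options outright, where the code above stops parsing):

	#include <stdbool.h>

	/* Options are 1-byte kind, then (except EOL/NOP) a 1-byte total
	 * length covering kind + len + data.
	 */
	static bool tcp_opt_walk(const unsigned char *ptr, int length)
	{
		while (length > 0) {
			int opcode = *ptr++;
			int opsize;

			if (opcode == 0)	/* TCPOPT_EOL */
				return true;
			if (opcode == 1) {	/* TCPOPT_NOP */
				length--;
				continue;
			}
			if (length < 2)		/* kind byte, no length byte */
				return false;
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return false;
			/* option data is ptr[0..opsize - 3] here */
			ptr += opsize - 2;
			length -= opsize;
		}
		return true;
	}
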
-unsigned int synproxy_options_size(const struct synproxy_options *opts)
+static unsigned int
+synproxy_options_size(const struct synproxy_options *opts)
{
unsigned int size = 0;
- if (opts->options & XT_SYNPROXY_OPT_MSS)
+ if (opts->options & NF_SYNPROXY_OPT_MSS)
size += TCPOLEN_MSS_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
size += TCPOLEN_TSTAMP_ALIGNED;
- else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
size += TCPOLEN_SACKPERM_ALIGNED;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE)
size += TCPOLEN_WSCALE_ALIGNED;
return size;
}
-EXPORT_SYMBOL_GPL(synproxy_options_size);
-void
+static void
synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
{
__be32 *ptr = (__be32 *)(th + 1);
u8 options = opts->options;
- if (options & XT_SYNPROXY_OPT_MSS)
+ if (options & NF_SYNPROXY_OPT_MSS)
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
- opts->mss);
+ opts->mss_option);
- if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
- if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
@@ -134,58 +136,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
*ptr++ = htonl(opts->tsval);
*ptr++ = htonl(opts->tsecr);
- } else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+ } else if (options & NF_SYNPROXY_OPT_SACK_PERM)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
- if (options & XT_SYNPROXY_OPT_WSCALE)
+ if (options & NF_SYNPROXY_OPT_WSCALE)
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->wscale);
}
-EXPORT_SYMBOL_GPL(synproxy_build_options);
-void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
struct synproxy_options *opts)
{
opts->tsecr = opts->tsval;
- opts->tsval = tcp_time_stamp_raw() & ~0x3f;
+ opts->tsval = tcp_clock_ms() & ~0x3f;
- if (opts->options & XT_SYNPROXY_OPT_WSCALE) {
+ if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
opts->tsval |= opts->wscale;
opts->wscale = info->wscale;
} else
opts->tsval |= 0xf;
- if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+ if (opts->options & NF_SYNPROXY_OPT_SACK_PERM)
opts->tsval |= 1 << 4;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
opts->tsval |= 1 << 5;
}
EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
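
synproxy_init_timestamp_cookie() smuggles the negotiated options through the
cookie exchange in the low six bits of the timestamp value, and
synproxy_check_timestamp_cookie() recovers them from the echoed tsecr. A worked
sketch of the bit layout (wscale in bits 0-3, where 0xf means window scaling is
off; SACK-perm in bit 4; ECN in bit 5):

	#include <stdbool.h>

	/* Encode: clear the low 6 bits of the millisecond clock, then pack. */
	static unsigned int tsval_encode(unsigned int now_ms, unsigned int wscale,
					 bool sack_perm, bool ecn)
	{
		unsigned int tsval = now_ms & ~0x3fU;

		tsval |= wscale & 0xfU;		/* callers pass 0xf when unused */
		if (sack_perm)
			tsval |= 1U << 4;
		if (ecn)
			tsval |= 1U << 5;
		return tsval;
	}

	/* Decode the window scale from the echoed timestamp. */
	static unsigned int tsecr_wscale(unsigned int tsecr)
	{
		return tsecr & 0xfU;		/* 0xf == scaling was off */
	}
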
-void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+static void
+synproxy_check_timestamp_cookie(struct synproxy_options *opts)
{
opts->wscale = opts->tsecr & 0xf;
if (opts->wscale != 0xf)
- opts->options |= XT_SYNPROXY_OPT_WSCALE;
+ opts->options |= NF_SYNPROXY_OPT_WSCALE;
- opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+ opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0;
- opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+ opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0;
}
-EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
-unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
- unsigned int protoff,
- struct tcphdr *th,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- const struct nf_conn_synproxy *synproxy)
+static unsigned int
+synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
+ struct tcphdr *th, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_conn_synproxy *synproxy)
{
unsigned int optoff, optend;
__be32 *ptr, old;
@@ -196,7 +196,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + th->doff * 4;
- if (!skb_make_writable(skb, optend))
+ if (skb_ensure_writable(skb, optend))
return 0;
while (optoff < optend) {
@@ -235,13 +235,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
}
return 1;
}
-EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
-
-static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
- .len = sizeof(struct nf_conn_synproxy),
- .align = __alignof__(struct nf_conn_synproxy),
- .id = NF_CT_EXT_SYNPROXY,
-};
#ifdef CONFIG_PROC_FS
static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
@@ -273,7 +266,7 @@ static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu + 1;
return per_cpu_ptr(snet->stats, cpu);
}
-
+ (*pos)++;
return NULL;
}
@@ -350,7 +343,6 @@ static int __net_init synproxy_net_init(struct net *net)
goto err2;
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
snet->tmpl = ct;
snet->stats = alloc_percpu(struct synproxy_stats);
@@ -389,32 +381,840 @@ static struct pernet_operations synproxy_net_ops = {
static int __init synproxy_core_init(void)
{
- int err;
+ return register_pernet_subsys(&synproxy_net_ops);
+}
- err = nf_ct_extend_register(&nf_ct_synproxy_extend);
- if (err < 0)
- goto err1;
+static void __exit synproxy_core_exit(void)
+{
+ unregister_pernet_subsys(&synproxy_net_ops);
+}
- err = register_pernet_subsys(&synproxy_net_ops);
- if (err < 0)
- goto err2;
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+static struct iphdr *
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
+{
+ struct iphdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) / 4;
+ iph->tos = 0;
+ iph->id = 0;
+ iph->frag_off = htons(IP_DF);
+ iph->ttl = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
+ iph->protocol = IPPROTO_TCP;
+ iph->check = 0;
+ iph->saddr = saddr;
+ iph->daddr = daddr;
+
+ return iph;
+}
+
+static void
+synproxy_send_tcp(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct iphdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ skb_dst_set_noref(nskb, skb_dst(skb));
+ nskb->protocol = htons(ETH_P_IP);
+ if (ip_route_me_harder(net, nskb->sk, nskb, RTN_UNSPEC))
+ goto free_nskb;
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss_encode;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack);
+
+static void
+synproxy_send_server_syn(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(struct net *net,
+ const struct ip_ct_tcp *state,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct iphdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ip_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack(struct net *net,
+ const struct sk_buff *skb, const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = __cookie_v4_check(ip_hdr(skb), th);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss_option = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack);
+
+unsigned int
+ipv4_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ unsigned int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb) ||
+ ip_hdr(skb)->protocol != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ thoff = ip_hdrlen(skb);
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+		 * adjustments; they will be initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+ fallthrough;
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+			 * number match that of the first SYN.
+ */
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv4_synproxy_hook);
+
+static const struct nf_hook_ops ipv4_synproxy_ops[] = {
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv4_synproxy_hook,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref4 == 0) {
+ err = nf_register_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
+ if (err)
+ return err;
+ }
+ snet->hook_ref4++;
return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init);
-err2:
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
-err1:
- return err;
+void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref4--;
+ if (snet->hook_ref4 == 0)
+ nf_unregister_net_hooks(net, ipv4_synproxy_ops,
+ ARRAY_SIZE(ipv4_synproxy_ops));
}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini);
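
nf_synproxy_ipv4_init()/_fini() reference-count the hook registration so that
several SYNPROXY users in one netns (the iptables target and the nftables
expression) can share a single pair of hooks. The idiom in isolation, assuming
callers are serialized by a lock as the real callers are under the nfnl mutex
(all names hypothetical):

	struct shared_hooks {
		unsigned int ref;	/* protected by the callers' lock */
	};

	static int shared_hooks_get(struct shared_hooks *sh,
				    int (*do_register)(void))
	{
		int err;

		if (sh->ref == 0) {
			err = do_register();	/* first user registers */
			if (err)
				return err;
		}
		sh->ref++;
		return 0;
	}

	static void shared_hooks_put(struct shared_hooks *sh,
				     void (*do_unregister)(void))
	{
		if (--sh->ref == 0)		/* last user tears down */
			do_unregister();
	}
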
-static void __exit synproxy_core_exit(void)
+#if IS_ENABLED(CONFIG_IPV6)
+static struct ipv6hdr *
+synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
{
- unregister_pernet_subsys(&synproxy_net_ops);
- nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+ struct ipv6hdr *iph;
+
+ skb_reset_network_header(skb);
+ iph = skb_put(skb, sizeof(*iph));
+ ip6_flow_hdr(iph, 0, 0);
+ iph->hop_limit = READ_ONCE(net->ipv6.devconf_all->hop_limit);
+ iph->nexthdr = IPPROTO_TCP;
+ iph->saddr = *saddr;
+ iph->daddr = *daddr;
+
+ return iph;
}
-module_init(synproxy_core_init);
-module_exit(synproxy_core_exit);
+static void
+synproxy_send_tcp_ipv6(struct net *net,
+ const struct sk_buff *skb, struct sk_buff *nskb,
+ struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+ struct ipv6hdr *niph, struct tcphdr *nth,
+ unsigned int tcp_hdr_size)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6;
+ int err;
+
+ nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)nth - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_TCP;
+ fl6.saddr = niph->saddr;
+ fl6.daddr = niph->daddr;
+ fl6.fl6_sport = nth->source;
+ fl6.fl6_dport = nth->dest;
+ security_skb_classify_flow((struct sk_buff *)skb,
+ flowi6_to_flowi_common(&fl6));
+ err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+	if (err)
+		goto free_nskb;
+
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst))
+ goto free_nskb;
+
+ skb_dst_set(nskb, dst);
+
+ if (nfct) {
+ nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
+ nf_conntrack_get(nfct);
+ }
+
+ ip6_local_out(net, nskb->sk, nskb);
+ return;
+
+free_nskb:
+ kfree_skb(nskb);
+}
+
+void
+synproxy_send_client_synack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+ u16 mss = opts->mss_encode;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = 0;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6);
+
+static void
+synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(recv_seq - 1);
+ /* ack_seq is used to relay our ISN to the synproxy hook to initialize
+ * sequence number translation once a connection tracking entry exists.
+ */
+ nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
+ tcp_flag_word(nth) = TCP_FLAG_SYN;
+ if (opts->options & NF_SYNPROXY_OPT_ECN)
+ tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = th->window;
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general,
+ IP_CT_NEW, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->dest;
+ nth->dest = th->source;
+ nth->seq = htonl(ntohl(th->ack_seq));
+ nth->ack_seq = htonl(ntohl(th->seq) + 1);
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth,
+ tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb,
+ const struct tcphdr *th,
+ const struct synproxy_options *opts)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *iph, *niph;
+ struct tcphdr *nth;
+ unsigned int tcp_hdr_size;
+
+ iph = ipv6_hdr(skb);
+
+ tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+ nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+ GFP_ATOMIC);
+ if (!nskb)
+ return;
+ skb_reserve(nskb, MAX_TCP_HEADER);
+
+ niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr);
+
+ skb_reset_transport_header(nskb);
+ nth = skb_put(nskb, tcp_hdr_size);
+ nth->source = th->source;
+ nth->dest = th->dest;
+ nth->seq = htonl(ntohl(th->seq) + 1);
+ nth->ack_seq = th->ack_seq;
+ tcp_flag_word(nth) = TCP_FLAG_ACK;
+ nth->doff = tcp_hdr_size / 4;
+ nth->window = htons(ntohs(th->window) >> opts->wscale);
+ nth->check = 0;
+ nth->urg_ptr = 0;
+
+ synproxy_build_options(nth, opts);
+
+ synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb),
+ IP_CT_ESTABLISHED_REPLY, niph, nth,
+ tcp_hdr_size);
+}
+
+bool
+synproxy_recv_client_ack_ipv6(struct net *net,
+ const struct sk_buff *skb,
+ const struct tcphdr *th,
+ struct synproxy_options *opts, u32 recv_seq)
+{
+ struct synproxy_net *snet = synproxy_pernet(net);
+ int mss;
+
+ mss = nf_cookie_v6_check(ipv6_hdr(skb), th);
+ if (mss == 0) {
+ this_cpu_inc(snet->stats->cookie_invalid);
+ return false;
+ }
+
+ this_cpu_inc(snet->stats->cookie_valid);
+ opts->mss_option = mss;
+ opts->options |= NF_SYNPROXY_OPT_MSS;
+
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_check_timestamp_cookie(opts);
+
+ synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6);
+
+unsigned int
+ipv6_synproxy_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *nhs)
+{
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct nf_conn_synproxy *synproxy;
+ struct synproxy_options opts = {};
+ const struct ip_ct_tcp *state;
+ struct tcphdr *th, _th;
+ __be16 frag_off;
+ u8 nexthdr;
+ int thoff;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ synproxy = nfct_synproxy(ct);
+ if (!synproxy)
+ return NF_ACCEPT;
+
+ if (nf_is_loopback_packet(skb))
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (thoff < 0 || nexthdr != IPPROTO_TCP)
+ return NF_ACCEPT;
+
+ th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+ if (!th)
+ return NF_DROP;
+
+ state = &ct->proto.tcp;
+ switch (state->state) {
+ case TCP_CONNTRACK_CLOSE:
+ if (th->rst && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+ ntohl(th->seq) + 1);
+ break;
+ }
+
+ if (!th->syn || th->ack ||
+ CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ break;
+
+ /* Reopened connection - reset the sequence number and timestamp
+		 * adjustments; they will be initialized once the connection is
+ * reestablished.
+ */
+ nf_ct_seqadj_init(ct, ctinfo, 0);
+ synproxy->tsoff = 0;
+ this_cpu_inc(snet->stats->conn_reopened);
+ fallthrough;
+ case TCP_CONNTRACK_SYN_SENT:
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (!th->syn && th->ack &&
+ CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+ /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+ * therefore we need to add 1 to make the SYN sequence
+			 * number match that of the first SYN.
+ */
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq) + 1)) {
+ this_cpu_inc(snet->stats->cookie_retrans);
+ consume_skb(skb);
+ return NF_STOLEN;
+ } else {
+ return NF_DROP;
+ }
+ }
+
+ synproxy->isn = ntohl(th->ack_seq);
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy->its = opts.tsecr;
+
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ break;
+ case TCP_CONNTRACK_SYN_RECV:
+ if (!th->syn || !th->ack)
+ break;
+
+ if (!synproxy_parse_options(skb, thoff, th, &opts))
+ return NF_DROP;
+
+ if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) {
+ synproxy->tsoff = opts.tsval - synproxy->its;
+ nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+ }
+
+ opts.options &= ~(NF_SYNPROXY_OPT_MSS |
+ NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_server_ack_ipv6(net, state, skb, th, &opts);
+
+ nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+ nf_conntrack_event_cache(IPCT_SEQADJ, ct);
+
+ swap(opts.tsval, opts.tsecr);
+ synproxy_send_client_ack_ipv6(net, skb, th, &opts);
+
+ consume_skb(skb);
+ return NF_STOLEN;
+ default:
+ break;
+ }
+
+ synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(ipv6_synproxy_hook);
+
+static const struct nf_hook_ops ipv6_synproxy_ops[] = {
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+ {
+ .hook = ipv6_synproxy_hook,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+ },
+};
+
+int
+nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net)
+{
+ int err;
+
+ if (snet->hook_ref6 == 0) {
+ err = nf_register_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+ if (err)
+ return err;
+ }
+
+ snet->hook_ref6++;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init);
+
+void
+nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net)
+{
+ snet->hook_ref6--;
+ if (snet->hook_ref6 == 0)
+ nf_unregister_net_hooks(net, ipv6_synproxy_ops,
+ ARRAY_SIZE(ipv6_synproxy_ops));
+}
+EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini);
+#endif /* CONFIG_IPV6 */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fb07f6cfc719..f3de2f9bbebf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -15,21 +12,31 @@
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
+#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
+#define NFT_SET_MAX_ANONLEN 16
+
+/* limit compaction to avoid huge kmalloc/krealloc sizes. */
+#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))
+
+unsigned int nf_tables_net_id __read_mostly;
+
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
-static u64 table_handle;
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
enum {
NFT_VALIDATE_SKIP = 0,
@@ -37,23 +44,73 @@ enum {
NFT_VALIDATE_DO,
};
+static struct rhltable nft_objname_ht;
+
static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);
+static u32 nft_objname_hash(const void *data, u32 len, u32 seed);
+static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed);
+static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *);
+
static const struct rhashtable_params nft_chain_ht_params = {
.head_offset = offsetof(struct nft_chain, rhlhead),
.key_offset = offsetof(struct nft_chain, name),
.hashfn = nft_chain_hash,
.obj_hashfn = nft_chain_hash_obj,
.obj_cmpfn = nft_chain_hash_cmp,
- .locks_mul = 1,
.automatic_shrinking = true,
};
-static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+static const struct rhashtable_params nft_objname_ht_params = {
+ .head_offset = offsetof(struct nft_object, rhlhead),
+ .key_offset = offsetof(struct nft_object, key),
+ .hashfn = nft_objname_hash,
+ .obj_hashfn = nft_objname_hash_obj,
+ .obj_cmpfn = nft_objname_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+struct nft_audit_data {
+ struct nft_table *table;
+ int entries;
+ int op;
+ struct list_head list;
+};
+
+static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
+ [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER,
+ [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER,
+ [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER,
+ [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER,
+ [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER,
+ [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER,
+ [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER,
+ [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER,
+ [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER,
+ [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER,
+ [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER,
+ [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER,
+ [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER,
+ [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET,
+ [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
+ [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID,
+ [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
+ [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
+};
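
nft2audit_op[] maps each netlink message type to an audit opcode, with
AUDIT_NFT_OP_INVALID marking message types that produce no audit record. A
hedged usage sketch (the bounds check is mine, not necessarily how the callers
index the table):

	static u8 nft_msg_to_audit_op(u8 msg_type)
	{
		if (msg_type >= NFT_MSG_MAX)	/* table covers [0, NFT_MSG_MAX) */
			return AUDIT_NFT_OP_INVALID;
		return nft2audit_op[msg_type];
	}
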
+
+static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
{
- switch (net->nft.validate_state) {
+ switch (table->validate_state) {
case NFT_VALIDATE_SKIP:
WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
break;
@@ -64,10 +121,12 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
return;
}
- net->nft.validate_state = new_validate_state;
+ table->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
-static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
+
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
@@ -86,41 +145,233 @@ static void nft_ctx_init(struct nft_ctx *ctx,
ctx->nla = nla;
ctx->portid = NETLINK_CB(skb).portid;
ctx->report = nlmsg_report(nlh);
+ ctx->flags = nlh->nlmsg_flags;
ctx->seq = nlh->nlmsg_seq;
+
+ bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}
-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
- int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
{
struct nft_trans *trans;
- trans = kzalloc(sizeof(struct nft_trans) + size, gfp);
+ trans = kzalloc(size, GFP_KERNEL);
if (trans == NULL)
return NULL;
+ INIT_LIST_HEAD(&trans->list);
trans->msg_type = msg_type;
- trans->ctx = *ctx;
+
+ trans->net = ctx->net;
+ trans->table = ctx->table;
+ trans->seq = ctx->seq;
+ trans->flags = ctx->flags;
+ trans->report = ctx->report;
return trans;
}
-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
- int msg_type, u32 size)
+static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ case NFT_MSG_NEWSET:
+ return container_of(trans, struct nft_trans_binding, nft_trans);
+ }
+
+ return NULL;
+}
+
+static void nft_trans_list_del(struct nft_trans *trans)
{
- return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
+ struct nft_trans_binding *trans_binding;
+
+ list_del(&trans->list);
+
+ trans_binding = nft_trans_get_binding(trans);
+ if (trans_binding)
+ list_del(&trans_binding->binding_list);
}
static void nft_trans_destroy(struct nft_trans *trans)
{
- list_del(&trans->list);
+ nft_trans_list_del(trans);
kfree(trans);
}
+static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set,
+ bool bind)
+{
+ struct nftables_pernet *nft_net;
+ struct net *net = ctx->net;
+ struct nft_trans *trans;
+
+ if (!nft_set_is_anonymous(set))
+ return;
+
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ if (nft_trans_set(trans) == set)
+ nft_trans_set_bound(trans) = bind;
+ break;
+ case NFT_MSG_NEWSETELEM:
+ if (nft_trans_elem_set(trans) == set)
+ nft_trans_elem_set_bound(trans) = bind;
+ break;
+ }
+ }
+}
+
+static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, true);
+}
+
+static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ return __nft_set_trans_bind(ctx, set, false);
+}
+
+static void __nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain, bool bind)
+{
+ struct nftables_pernet *nft_net;
+ struct net *net = ctx->net;
+ struct nft_trans *trans;
+
+ if (!nft_chain_binding(chain))
+ return;
+
+ nft_net = nft_pernet(net);
+ list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (nft_trans_chain(trans) == chain)
+ nft_trans_chain_bound(trans) = bind;
+ break;
+ case NFT_MSG_NEWRULE:
+ if (nft_trans_rule_chain(trans) == chain)
+ nft_trans_rule_bound(trans) = bind;
+ break;
+ }
+ }
+}
+
+static void nft_chain_trans_bind(const struct nft_ctx *ctx,
+ struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, true);
+}
+
+int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ if (!nft_chain_binding(chain))
+ return 0;
+
+ if (nft_chain_binding(ctx->chain))
+ return -EOPNOTSUPP;
+
+ if (chain->bound)
+ return -EBUSY;
+
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
+
+ chain->bound = true;
+ nft_chain_trans_bind(ctx, chain);
+
+ return 0;
+}
+
+void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+ __nft_chain_trans_bind(ctx, chain, false);
+}
+
+static int nft_netdev_register_hooks(struct net *net,
+ struct list_head *hook_list)
+{
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ int err, j;
+
+ j = 0;
+ list_for_each_entry(hook, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nf_register_net_hook(net, ops);
+ if (err < 0)
+ goto err_register;
+
+ j++;
+ }
+ }
+ return 0;
+
+err_register:
+ list_for_each_entry(hook, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (j-- <= 0)
+ break;
+
+ nf_unregister_net_hook(net, ops);
+ }
+ }
+ return err;
+}
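
nft_netdev_register_hooks() uses the count-and-unwind idiom: count every
successful nf_register_net_hook(), and on failure replay the lists in the same
order, unregistering exactly that many. Reduced to an array-based skeleton
(do_register()/do_unregister() are hypothetical stand-ins):

	static int register_all(struct hook *hooks, int n)
	{
		int i, err;

		for (i = 0; i < n; i++) {
			err = do_register(&hooks[i]);
			if (err)
				goto unwind;
		}
		return 0;

	unwind:
		while (--i >= 0)	/* undo only the ones that succeeded */
			do_unregister(&hooks[i]);
		return err;
	}
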
+
+static void nft_netdev_hook_free_ops(struct nft_hook *hook)
+{
+ struct nf_hook_ops *ops, *next;
+
+ list_for_each_entry_safe(ops, next, &hook->ops_list, list) {
+ list_del(&ops->list);
+ kfree(ops);
+ }
+}
+
+static void nft_netdev_hook_free(struct nft_hook *hook)
+{
+ nft_netdev_hook_free_ops(hook);
+ kfree(hook);
+}
+
+static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu)
+{
+ struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu);
+
+ nft_netdev_hook_free(hook);
+}
+
+static void nft_netdev_hook_free_rcu(struct nft_hook *hook)
+{
+ call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu);
+}
+
+static void nft_netdev_unregister_hooks(struct net *net,
+ struct list_head *hook_list,
+ bool release_netdev)
+{
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nf_unregister_net_hook(net, ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ }
+}
+
static int nf_tables_register_hook(struct net *net,
const struct nft_table *table,
struct nft_chain *chain)
{
- const struct nft_base_chain *basechain;
+ struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -133,14 +384,18 @@ static int nf_tables_register_hook(struct net *net,
if (basechain->type->ops_register)
return basechain->type->ops_register(net, ops);
- return nf_register_net_hook(net, ops);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+ return nf_register_net_hook(net, &basechain->ops);
}
-static void nf_tables_unregister_hook(struct net *net,
- const struct nft_table *table,
- struct nft_chain *chain)
+static void __nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain,
+ bool release_netdev)
{
- const struct nft_base_chain *basechain;
+ struct nft_base_chain *basechain;
const struct nf_hook_ops *ops;
if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -152,7 +407,144 @@ static void nf_tables_unregister_hook(struct net *net,
if (basechain->type->ops_unregister)
return basechain->type->ops_unregister(net, ops);
- nf_unregister_net_hook(net, ops);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ nft_netdev_unregister_hooks(net, &basechain->hook_list,
+ release_netdev);
+ else
+ nf_unregister_net_hook(net, &basechain->ops);
+}
+
+static void nf_tables_unregister_hook(struct net *net,
+ const struct nft_table *table,
+ struct nft_chain *chain)
+{
+ return __nf_tables_unregister_hook(net, table, chain, false);
+}
+
+static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
+{
+	/* NB: the ->bound equality check is defensive: at this time we only merge
+ * a new nft_trans_elem transaction request with the transaction tail
+ * element, but a->bound != b->bound would imply a NEWRULE transaction
+ * is queued in-between.
+ *
+ * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents
+ * huge krealloc() requests.
+ */
+ return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
+}
+
+static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
+ struct nft_trans_elem *tail,
+ struct nft_trans_elem *trans)
+{
+ unsigned int nelems, old_nelems = tail->nelems;
+ struct nft_trans_elem *new_trans;
+
+ if (!nft_trans_collapse_set_elem_allowed(tail, trans))
+ return false;
+
+ /* "cannot happen", at this time userspace element add
+ * requests always allocate a new transaction element.
+ *
+ * This serves as a reminder to adjust the list_add_tail
+ * logic below in case this ever changes.
+ */
+ if (WARN_ON_ONCE(trans->nelems != 1))
+ return false;
+
+ if (check_add_overflow(old_nelems, trans->nelems, &nelems))
+ return false;
+
+ /* krealloc might free tail which invalidates list pointers */
+ list_del_init(&tail->nft_trans.list);
+
+ new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+ GFP_KERNEL);
+ if (!new_trans) {
+ list_add_tail(&tail->nft_trans.list,
+ &nft_net->commit_list);
+ return false;
+ }
+
+	/* new_trans->nft_trans.list contains garbage, but
+	 * list_add_tail() doesn't care.
+	 */
+ new_trans->nelems = nelems;
+ new_trans->elems[old_nelems] = trans->elems[0];
+ list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);
+
+ return true;
+}
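
The list_del_init() before krealloc() above is the crux of the collapse:
krealloc() may free the old allocation and hand back a moved block, and an
embedded list head that is still linked would leave its neighbours pointing
into freed memory. A userspace miniature of the same hazard and fix (names
illustrative):

	#include <stdlib.h>

	struct node {
		struct node *prev, *next;	/* intrusive list links */
		size_t nelems;
		int elems[];			/* grown by realloc() */
	};

	static struct node *node_grow(struct node *n, size_t extra)
	{
		struct node *nn;

		/* Unlink first: realloc() may move, i.e. free, the old block. */
		n->prev->next = n->next;
		n->next->prev = n->prev;

		nn = realloc(n, sizeof(*nn) + (n->nelems + extra) * sizeof(int));
		/* On failure nn is NULL and n is still valid; the caller must
		 * re-link it, as the kernel code does with list_add_tail().
		 */
		return nn;
	}
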
+
+static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
+ struct nft_trans *trans)
+{
+ struct nft_trans *tail;
+
+ if (list_empty(&nft_net->commit_list))
+ return false;
+
+ tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
+
+ if (tail->msg_type != trans->msg_type)
+ return false;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM:
+ return nft_trans_collapse_set_elem(nft_net,
+ nft_trans_container_elem(tail),
+ nft_trans_container_elem(trans));
+ }
+
+ return false;
+}
+
+static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_trans_binding *binding;
+ struct nft_trans_set *trans_set;
+
+ list_add_tail(&trans->list, &nft_net->commit_list);
+
+ binding = nft_trans_get_binding(trans);
+ if (!binding)
+ return;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ trans_set = nft_trans_container_set(trans);
+
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+
+ list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list);
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)))
+ list_add_tail(&binding->binding_list, &nft_net->binding_list);
+ break;
+ }
+}
+
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
+ trans->msg_type != NFT_MSG_DELSETELEM);
+
+ if (nft_trans_try_collapse(nft_net, trans)) {
+ kfree(trans);
+ return;
+ }
+
+ nft_trans_commit_list_add_tail(net, trans);
}
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -166,7 +558,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
if (msg_type == NFT_MSG_NEWTABLE)
nft_activate_next(ctx->net, ctx->table);
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -182,54 +574,64 @@ static int nft_deltable(struct nft_ctx *ctx)
return err;
}
-static int nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
+static struct nft_trans *
+nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type)
{
+ struct nft_trans_chain *trans_chain;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_chain));
- if (trans == NULL)
- return -ENOMEM;
+ if (!trans)
+ return NULL;
- if (msg_type == NFT_MSG_NEWCHAIN)
- nft_activate_next(ctx->net, ctx->chain);
+ trans_chain = nft_trans_container_chain(trans);
+ INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list);
+ trans_chain->chain = ctx->chain;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
- return 0;
+ return trans;
}
-static int nft_delchain(struct nft_ctx *ctx)
+static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
{
- int err;
+ struct nft_trans *trans;
- err = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN);
- if (err < 0)
- return err;
+ trans = nft_trans_alloc_chain(ctx, msg_type);
+ if (trans == NULL)
+ return ERR_PTR(-ENOMEM);
- ctx->table->use--;
- nft_deactivate_next(ctx->net, ctx->chain);
+ if (msg_type == NFT_MSG_NEWCHAIN) {
+ nft_activate_next(ctx->net, ctx->chain);
- return err;
+ if (ctx->nla[NFTA_CHAIN_ID]) {
+ nft_trans_chain_id(trans) =
+ ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
+ }
+ }
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return trans;
}
-/* either expr ops provide both activate/deactivate, or neither */
-static bool nft_expr_check_ops(const struct nft_expr_ops *ops)
+static int nft_delchain(struct nft_ctx *ctx)
{
- if (!ops)
- return true;
+ struct nft_trans *trans;
- if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate)))
- return false;
+ trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- return true;
+ nft_use_dec(&ctx->table->use);
+ nft_deactivate_next(ctx->net, ctx->chain);
+
+ return 0;
}
-static void nft_rule_expr_activate(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->activate)
expr->ops->activate(ctx, expr);
@@ -237,15 +639,15 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx,
}
}
-static void nft_rule_expr_deactivate(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
+ enum nft_trans_phase phase)
{
struct nft_expr *expr;
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
if (expr->ops->deactivate)
- expr->ops->deactivate(ctx, expr);
+ expr->ops->deactivate(ctx, expr, phase);
expr = nft_expr_next(expr);
}
@@ -257,7 +659,7 @@ nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
/* You cannot delete the same rule twice */
if (nft_is_active_next(ctx->net, rule)) {
nft_deactivate_next(ctx->net, rule);
- ctx->chain->use--;
+ nft_use_dec(&ctx->chain->use);
return 0;
}
return -ENOENT;
@@ -277,13 +679,15 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID]));
}
nft_trans_rule(trans) = rule;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_rule_chain(trans) = ctx->chain;
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return trans;
}
static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
{
+ struct nft_flow_rule *flow;
struct nft_trans *trans;
int err;
@@ -291,12 +695,22 @@ static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
if (trans == NULL)
return -ENOMEM;
+ if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(ctx->net, rule);
+ if (IS_ERR(flow)) {
+ nft_trans_destroy(trans);
+ return PTR_ERR(flow);
+ }
+
+ nft_trans_flow_rule(trans) = flow;
+ }
+
err = nf_tables_delrule_deactivate(ctx, rule);
if (err < 0) {
nft_trans_destroy(trans);
return err;
}
- nft_rule_expr_deactivate(ctx, rule);
+ nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE);
return 0;
}
@@ -307,6 +721,9 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
int err;
list_for_each_entry(rule, &ctx->chain->rules, list) {
+ if (!nft_is_active_next(ctx->net, rule))
+ continue;
+
err = nft_delrule(ctx, rule);
if (err < 0)
return err;
@@ -314,26 +731,98 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx)
return 0;
}
-static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
- struct nft_set *set)
+static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set,
+ const struct nft_set_desc *desc)
{
+ struct nft_trans_set *trans_set;
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_set));
if (trans == NULL)
return -ENOMEM;
- if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] != NULL) {
+ trans_set = nft_trans_container_set(trans);
+ INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list);
+ INIT_LIST_HEAD(&trans_set->list_trans_newset);
+
+ if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) {
nft_trans_set_id(trans) =
ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
nft_activate_next(ctx->net, set);
}
nft_trans_set(trans) = set;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ if (desc) {
+ nft_trans_set_update(trans) = true;
+ nft_trans_set_gc_int(trans) = desc->gc_int;
+ nft_trans_set_timeout(trans) = desc->timeout;
+ nft_trans_set_size(trans) = desc->size;
+ }
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
+static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
+ struct nft_set *set)
+{
+ return __nft_trans_set_add(ctx, msg_type, set, NULL);
+}
+
+static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+
+ return 0;
+}
+
+struct nft_set_elem_catchall {
+ struct list_head list;
+ struct rcu_head rcu;
+ struct nft_elem_priv *elem;
+};
+
+static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ nft_set_elem_change_active(ctx->net, set, ext);
+ nft_setelem_data_deactivate(ctx->net, set, catchall->elem);
+ break;
+ }
+}
+
+static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_deactivate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_deactivate(ctx, set);
+}
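
nft_map_deactivate() asks the set backend to walk every element and deactivate the data side of each map binding through a caller-supplied callback, then handles the catch-all element separately. The walk/callback shape, sketched in plain userspace C (names illustrative):

#include <stdio.h>

struct iter {
    int (*fn)(int elem, void *ctx);
    int err;
};

/* stand-in for set->ops->walk(): the backend drives, the callback decides */
static void set_walk(const int *elems, int n, struct iter *it, void *ctx)
{
    for (int i = 0; i < n; i++) {
        it->err = it->fn(elems[i], ctx);
        if (it->err)
            return;
    }
}

static int deactivate(int elem, void *ctx)
{
    (void)ctx;
    printf("deactivating element %d\n", elem);
    return 0;
}

int main(void)
{
    int elems[] = { 1, 2, 3 };
    struct iter it = { .fn = deactivate };

    set_walk(elems, 3, &it, NULL);
    return it.err;
}
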
+
static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
int err;
@@ -342,8 +831,11 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
if (err < 0)
return err;
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
nft_deactivate_next(ctx->net, set);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
@@ -361,7 +853,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
nft_activate_next(ctx->net, obj);
nft_trans_obj(trans) = obj;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
}
@@ -375,63 +867,125 @@ static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
return err;
nft_deactivate_next(ctx->net, obj);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
return err;
}
-static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
- struct nft_flowtable *flowtable)
+static struct nft_trans *
+nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
+ struct nft_flowtable *flowtable)
{
struct nft_trans *trans;
trans = nft_trans_alloc(ctx, msg_type,
sizeof(struct nft_trans_flowtable));
if (trans == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (msg_type == NFT_MSG_NEWFLOWTABLE)
nft_activate_next(ctx->net, flowtable);
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
nft_trans_flowtable(trans) = flowtable;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
- return 0;
+ return trans;
}
static int nft_delflowtable(struct nft_ctx *ctx,
struct nft_flowtable *flowtable)
{
- int err;
+ struct nft_trans *trans;
- err = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
- if (err < 0)
- return err;
+ trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
nft_deactivate_next(ctx->net, flowtable);
- ctx->table->use--;
+ nft_use_dec(&ctx->table->use);
- return err;
+ return 0;
+}
+
+static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
+{
+ int i;
+
+ for (i = track->regs[dreg].num_reg; i > 0; i--)
+ __nft_reg_track_cancel(track, dreg - i);
+}
+
+static void __nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr,
+ u8 dreg, u8 num_reg)
+{
+ track->regs[dreg].selector = expr;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = num_reg;
}
+void nft_reg_track_update(struct nft_regs_track *track,
+ const struct nft_expr *expr, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_update(track, expr, dreg, i);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_update);
+
+void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
+{
+ unsigned int regcount;
+ int i;
+
+ __nft_reg_track_clobber(track, dreg);
+
+ regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ __nft_reg_track_cancel(track, dreg);
+}
+EXPORT_SYMBOL_GPL(nft_reg_track_cancel);
+
+void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
+{
+ track->regs[dreg].selector = NULL;
+ track->regs[dreg].bitwise = NULL;
+ track->regs[dreg].num_reg = 0;
+}
+EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);
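
A destination of len bytes spans DIV_ROUND_UP(len, NFT_REG32_SIZE) consecutive 32-bit registers, all of which the helpers above track or cancel together; num_reg records how far a register sits from the start of its run. A small userspace sketch of that arithmetic:

#include <stdio.h>

#define NFT_REG32_SIZE      4
#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned int dreg = 8, len = 10;    /* 10 bytes -> 3 registers */
    unsigned int regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);

    for (unsigned int i = 0; i < regcount; i++)
        printf("reg %u tracked (num_reg=%u)\n", dreg + i, i);
    return 0;
}
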
+
/*
* Tables
*/
static struct nft_table *nft_table_lookup(const struct net *net,
const struct nlattr *nla,
- u8 family, u8 genmask)
+ u8 family, u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry_rcu(table, &nft_net->tables, list,
+ lockdep_is_held(&nft_net->commit_mutex)) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
- nft_active_genmask(table, genmask))
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
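
nft_table_lookup() now also enforces table ownership: when a table carries NFT_TABLE_F_OWNER, only the netlink port that created it may operate on it, while callers passing nlpid == 0 (dumps, reads) bypass the check. A minimal userspace sketch of the check (flag value and names illustrative):

#include <stdio.h>
#include <errno.h>
#include <string.h>

struct table { unsigned int flags; unsigned int nlpid; };

#define F_OWNER 0x2     /* stands in for NFT_TABLE_F_OWNER */

static int owner_check(const struct table *t, unsigned int nlpid)
{
    /* nlpid == 0 means "no owner enforcement", as in dump paths */
    if ((t->flags & F_OWNER) && nlpid && t->nlpid != nlpid)
        return -EPERM;
    return 0;
}

int main(void)
{
    struct table t = { .flags = F_OWNER, .nlpid = 100 };

    printf("same owner:  %d\n", owner_check(&t, 100));
    printf("other owner: %s\n", strerror(-owner_check(&t, 200)));
    return 0;
}
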
@@ -439,14 +993,22 @@ static struct nft_table *nft_table_lookup(const struct net *net,
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
const struct nlattr *nla,
- u8 genmask)
+ int family, u8 genmask, u32 nlpid)
{
+ struct nftables_pernet *nft_net;
struct nft_table *table;
- list_for_each_entry(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ list_for_each_entry(table, &nft_net->tables, list) {
if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
- nft_active_genmask(table, genmask))
+ table->family == family &&
+ nft_active_genmask(table, genmask)) {
+ if (nft_table_has_owner(table) &&
+ nlpid && table->nlpid != nlpid)
+ return ERR_PTR(-EPERM);
+
return table;
+ }
}
return ERR_PTR(-ENOENT);
@@ -460,53 +1022,82 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
static const struct nft_chain_type *
+__nft_chain_type_get(u8 family, enum nft_chain_types type)
+{
+ if (family >= NFPROTO_NUMPROTO ||
+ type >= NFT_CHAIN_T_MAX)
+ return NULL;
+
+ return chain_type[family][type];
+}
+
+static const struct nft_chain_type *
__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
{
+ const struct nft_chain_type *type;
int i;
for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
- if (chain_type[family][i] != NULL &&
- !nla_strcmp(nla, chain_type[family][i]->name))
- return chain_type[family][i];
+ type = __nft_chain_type_get(family, i);
+ if (!type)
+ continue;
+ if (!nla_strcmp(nla, type->name))
+ return type;
}
return NULL;
}
-/*
- * Loading a module requires dropping mutex that guards the
- * transaction.
- * We first need to abort any pending transactions as once
- * mutex is unlocked a different client could start a new
- * transaction. It must not see any 'future generation'
- * changes, as these changes will never happen.

- */
-#ifdef CONFIG_MODULES
-static int __nf_tables_abort(struct net *net);
+struct nft_module_request {
+ struct list_head list;
+ char module[MODULE_NAME_LEN];
+ bool done;
+};
-static void nft_request_module(struct net *net, const char *fmt, ...)
+#ifdef CONFIG_MODULES
+__printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
+ ...)
{
char module_name[MODULE_NAME_LEN];
+ struct nftables_pernet *nft_net;
+ struct nft_module_request *req;
va_list args;
int ret;
- __nf_tables_abort(net);
-
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
- if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
- return;
+ if (ret >= MODULE_NAME_LEN)
+ return 0;
+
+ nft_net = nft_pernet(net);
+ list_for_each_entry(req, &nft_net->module_list, list) {
+ if (!strcmp(req->module, module_name)) {
+ if (req->done)
+ return 0;
+
+ /* A request to load this module already exists. */
+ return -EAGAIN;
+ }
+ }
- mutex_unlock(&net->nft.commit_mutex);
- request_module("%s", module_name);
- mutex_lock(&net->nft.commit_mutex);
+ req = kmalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->done = false;
+ strscpy(req->module, module_name, MODULE_NAME_LEN);
+ list_add_tail(&req->list, &nft_net->module_list);
+
+ return -EAGAIN;
}
+EXPORT_SYMBOL_GPL(nft_request_module);
#endif
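
Rather than dropping the commit mutex to call request_module() inline, nft_request_module() now queues a deduplicated request and returns -EAGAIN, so the whole batch is aborted and replayed once the module has been loaded outside the transaction. A userspace sketch of the queue-and-replay idea (fixed-size table for brevity, names illustrative):

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct req { char name[64]; int done; };
static struct req reqs[8];
static int nreqs;

static int request_module_queued(const char *name)
{
    for (int i = 0; i < nreqs; i++) {
        if (!strcmp(reqs[i].name, name))
            return reqs[i].done ? 0 : -EAGAIN;
    }
    snprintf(reqs[nreqs].name, sizeof(reqs[nreqs].name), "%s", name);
    reqs[nreqs].done = 0;
    nreqs++;
    return -EAGAIN;     /* caller aborts and replays the batch */
}

int main(void)
{
    printf("first attempt: %d\n", request_module_queued("nft-chain-2-filter"));
    reqs[0].done = 1;   /* pretend the module loader finished */
    printf("replayed:      %d\n", request_module_queued("nft-chain-2-filter"));
    return 0;
}
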
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
- WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ if (debug_locks)
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}
@@ -523,21 +1114,32 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (autoload) {
- nft_request_module(net, "nft-chain-%u-%.*s", family,
- nla_len(nla), (const char *)nla_data(nla));
- type = __nf_tables_chain_type_lookup(nla, family);
- if (type != NULL)
+ if (nft_request_module(net, "nft-chain-%u-%.*s", family,
+ nla_len(nla),
+ (const char *)nla_data(nla)) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
return ERR_PTR(-ENOENT);
}
+static unsigned int nft_base_seq(const struct net *net)
+{
+ return READ_ONCE(net->nft.base_seq);
+}
+
+static __be16 nft_base_seq_be16(const struct net *net)
+{
+ return htons(nft_base_seq(net) & 0xffff);
+}
+
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
[NFTA_TABLE_NAME] = { .type = NLA_STRING,
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
[NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN }
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -545,25 +1147,38 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
- nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELTABLE ||
+ event == NFT_MSG_DESTROYTABLE) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
+ htonl(table->flags & NFT_TABLE_F_MASK)))
+ goto nla_put_failure;
+
+ if (nft_table_has_owner(table) &&
+ nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
+ goto nla_put_failure;
+
+ if (table->udata) {
+ if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
+ goto nla_put_failure;
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -572,9 +1187,23 @@ nla_put_failure:
return -1;
}
+struct nftnl_skb_parms {
+ bool report;
+};
+#define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb))
+
+static void nft_notify_enqueue(struct sk_buff *skb, bool report,
+ struct list_head *notify_list)
+{
+ NFT_CB(skb).report = report;
+ list_add_tail(&skb->list, notify_list);
+}
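
nft_notify_enqueue() defers notifications to a per-netns list instead of sending them immediately, stashing the report flag in the skb control block via the NFT_CB() cast so the commit path can recover it at flush time. The cb trick in a self-contained userspace sketch:

#include <stdio.h>

struct buf { char cb[48]; };            /* stands in for skb->cb */
struct parms { int report; };

#define CB(b) (*(struct parms *)&(b)->cb)

int main(void)
{
    struct buf b;

    CB(&b).report = 1;                  /* set when queueing the buffer */
    printf("report flag: %d\n", CB(&b).report); /* read when flushing */
    return 0;
}
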
+
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -585,15 +1214,18 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table);
+ event, flags, ctx->family, ctx->table);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -603,15 +1235,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -657,28 +1291,27 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
}
/* called with rcu_read_lock held */
-static int nf_tables_gettable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_tables,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
return PTR_ERR(table);
@@ -689,14 +1322,14 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0,
- family, table);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
+ 0, family, table);
if (err < 0)
- goto err;
+ goto err_fill_table_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_table_info:
kfree_skb(skb2);
return err;
}
@@ -715,7 +1348,7 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
if (cnt && i++ == cnt)
break;
- nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+ nf_tables_unregister_hook(net, table, chain);
}
}
@@ -730,14 +1363,15 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
if (!nft_is_base_chain(chain))
continue;
- err = nf_register_net_hook(net, &nft_base_chain(chain)->ops);
+ err = nf_tables_register_hook(net, table, chain);
if (err < 0)
- goto err;
+ goto err_register_hooks;
i++;
}
return 0;
-err:
+
+err_register_hooks:
if (i)
nft_table_disable(net, table, i);
return err;
@@ -745,25 +1379,68 @@ err:
static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
+ table->flags &= ~NFT_TABLE_F_DORMANT;
nft_table_disable(net, table, 0);
+ table->flags |= NFT_TABLE_F_DORMANT;
+}
+
+#define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1)
+#define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0)
+#define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1)
+#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2)
+#define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \
+ __NFT_TABLE_F_WAS_AWAKEN | \
+ __NFT_TABLE_F_WAS_ORPHAN)
+
+static bool nft_table_pending_update(const struct nft_ctx *ctx)
+{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ struct nft_trans *trans;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return true;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->table == ctx->table &&
+ ((trans->msg_type == NFT_MSG_NEWCHAIN &&
+ nft_trans_chain_update(trans)) ||
+ (trans->msg_type == NFT_MSG_DELCHAIN &&
+ nft_is_base_chain(nft_trans_chain(trans)))))
+ return true;
+ }
+
+ return false;
}
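
The __NFT_TABLE_F_WAS_* bits live just above NFT_TABLE_F_MASK, i.e. outside the UAPI flag space, so the commit and abort paths can remember a table's pre-transaction dormant/owner state without ever exposing these bits to userspace. A sketch of the bit carving (mask width illustrative):

#include <stdio.h>

#define F_MASK        0x7               /* user-visible flag bits */
#define F_INTERNAL    (F_MASK + 1)      /* first free bit: 0x8 */
#define F_WAS_DORMANT (F_INTERNAL << 0)
#define F_WAS_AWAKEN  (F_INTERNAL << 1)

int main(void)
{
    unsigned int flags = 0x5 | F_WAS_AWAKEN;

    printf("uapi bits: 0x%x, internal bits: 0x%x\n",
           flags & F_MASK, flags & ~F_MASK);
    return 0;
}
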
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
u32 flags;
- int ret = 0;
+ int ret;
if (!ctx->nla[NFTA_TABLE_FLAGS])
return 0;
flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
- if (flags == ctx->table->flags)
+ if (flags == (ctx->table->flags & NFT_TABLE_F_MASK))
return 0;
+ if ((nft_table_has_owner(ctx->table) &&
+ !(flags & NFT_TABLE_F_OWNER)) ||
+ (flags & NFT_TABLE_F_OWNER &&
+ !nft_table_is_orphan(ctx->table)))
+ return -EOPNOTSUPP;
+
+ if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
+ return -EOPNOTSUPP;
+
+ /* No dormant off/on/off/on games in single transaction */
+ if (nft_table_pending_update(ctx))
+ return -EINVAL;
+
trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
sizeof(struct nft_trans_table));
if (trans == NULL)
@@ -771,22 +1448,35 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
+ ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ret = nf_tables_table_enable(ctx->net, ctx->table);
- if (ret >= 0) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
- nft_trans_table_enable(trans) = true;
+ ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
+ if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
+ ret = nf_tables_table_enable(ctx->net, ctx->table);
+ if (ret < 0)
+ goto err_register_hooks;
+
+ ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
}
}
- if (ret < 0)
- goto err;
+
+ if ((flags & NFT_TABLE_F_OWNER) &&
+ !nft_table_has_owner(ctx->table)) {
+ ctx->table->nlpid = ctx->portid;
+ ctx->table->flags |= NFT_TABLE_F_OWNER |
+ __NFT_TABLE_F_WAS_ORPHAN;
+ }
nft_trans_table_update(trans) = true;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
return 0;
-err:
+
+err_register_hooks:
+ ctx->table->flags |= NFT_TABLE_F_DORMANT;
nft_trans_destroy(trans);
return ret;
}
@@ -814,53 +1504,119 @@ static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
return strcmp(chain->name, name);
}
-static int nf_tables_newtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static u32 nft_objname_hash(const void *data, u32 len, u32 seed)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ const struct nft_object_hash_key *k = data;
+
+ seed ^= hash_ptr(k->table, 32);
+
+ return jhash(k->name, strlen(k->name), seed);
+}
+
+static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct nft_object *obj = data;
+
+ return nft_objname_hash(&obj->key, 0, seed);
+}
+
+static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct nft_object_hash_key *k = arg->key;
+ const struct nft_object *obj = ptr;
+
+ if (obj->key.table != k->table)
+ return -1;
+
+ return strcmp(obj->key.name, k->name);
+}
+
+static bool nft_supported_family(u8 family)
+{
+ return false
+#ifdef CONFIG_NF_TABLES_INET
+ || family == NFPROTO_INET
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ || family == NFPROTO_IPV4
+#endif
+#ifdef CONFIG_NF_TABLES_ARP
+ || family == NFPROTO_ARP
+#endif
+#ifdef CONFIG_NF_TABLES_NETDEV
+ || family == NFPROTO_NETDEV
+#endif
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ || family == NFPROTO_BRIDGE
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ || family == NFPROTO_IPV6
+#endif
+ ;
+}
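
nft_supported_family() rejects families whose support was compiled out, using a `false || ...` chain so the function body stays syntactically valid whichever CONFIG_NF_TABLES_* subset is enabled. The same pattern in a compilable userspace sketch:

#include <stdio.h>
#include <stdbool.h>

#define CFG_IPV4 1
/* #define CFG_IPV6 1 */

static bool supported(int family)
{
    return false
#ifdef CFG_IPV4
        || family == 2      /* AF_INET */
#endif
#ifdef CFG_IPV6
        || family == 10     /* AF_INET6 */
#endif
        ;
}

int main(void)
{
    printf("ipv4: %d, ipv6: %d\n", supported(2), supported(10));
    return 0;
}
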
+
+static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- u32 flags = 0;
struct nft_ctx ctx;
+ u32 flags = 0;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ if (!nft_supported_family(family))
+ return -EOPNOTSUPP;
+
+ lockdep_assert_held(&nft_net->commit_mutex);
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
if (PTR_ERR(table) != -ENOENT)
return PTR_ERR(table);
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nf_tables_updtable(&ctx);
}
if (nla[NFTA_TABLE_FLAGS]) {
flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
- if (flags & ~NFT_TABLE_F_DORMANT)
- return -EINVAL;
+ if (flags & ~NFT_TABLE_F_MASK)
+ return -EOPNOTSUPP;
}
err = -ENOMEM;
- table = kzalloc(sizeof(*table), GFP_KERNEL);
+ table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
if (table == NULL)
goto err_kzalloc;
- table->name = nla_strdup(attr, GFP_KERNEL);
+ table->validate_state = nft_net->validate_state;
+ table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (table->name == NULL)
goto err_strdup;
+ if (nla[NFTA_TABLE_USERDATA]) {
+ table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (table->udata == NULL)
+ goto err_table_udata;
+
+ table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
+ }
+
err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
if (err)
goto err_chain_ht;
@@ -871,18 +1627,22 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
- table->handle = ++table_handle;
+ table->handle = ++nft_net->table_handle;
+ if (table->flags & NFT_TABLE_F_OWNER)
+ table->nlpid = NETLINK_CB(skb).portid;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err_trans;
- list_add_tail_rcu(&table->list, &net->nft.tables);
+ list_add_tail_rcu(&table->list, &nft_net->tables);
return 0;
err_trans:
rhltable_destroy(&table->chains_ht);
err_chain_ht:
+ kfree(table->udata);
+err_table_udata:
kfree(table->name);
err_strdup:
kfree(table);
@@ -902,6 +1662,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delrule_by_chain(ctx);
@@ -913,8 +1676,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, set))
continue;
- if (nft_set_is_anonymous(set) &&
- !list_empty(&set->bindings))
+ if (nft_set_is_anonymous(set))
continue;
err = nft_delset(ctx, set);
@@ -923,12 +1685,18 @@ static int nft_flush_table(struct nft_ctx *ctx)
}
list_for_each_entry_safe(flowtable, nft, &ctx->table->flowtables, list) {
+ if (!nft_is_active_next(ctx->net, flowtable))
+ continue;
+
err = nft_delflowtable(ctx, flowtable);
if (err < 0)
goto out;
}
list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+ if (!nft_is_active_next(ctx->net, obj))
+ continue;
+
err = nft_delobj(ctx, obj);
if (err < 0)
goto out;
@@ -938,6 +1706,9 @@ static int nft_flush_table(struct nft_ctx *ctx)
if (!nft_is_active_next(ctx->net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
+
ctx->chain = chain;
err = nft_delchain(ctx);
@@ -952,11 +1723,12 @@ out:
static int nft_flush(struct nft_ctx *ctx, int family)
{
- struct nft_table *table, *nt;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nlattr * const *nla = ctx->nla;
+ struct nft_table *table, *nt;
int err = 0;
- list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) {
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
if (family != AF_UNSPEC && table->family != family)
continue;
@@ -965,6 +1737,9 @@ static int nft_flush(struct nft_ctx *ctx, int family)
if (!nft_is_active_next(ctx->net, table))
continue;
+ if (nft_table_has_owner(table) && table->nlpid != ctx->portid)
+ continue;
+
if (nla[NFTA_TABLE_NAME] &&
nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0)
continue;
@@ -979,37 +1754,42 @@ out:
return err;
}
-static int nf_tables_deltable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);
if (family == AF_UNSPEC ||
(!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
if (nla[NFTA_TABLE_HANDLE]) {
attr = nla[NFTA_TABLE_HANDLE];
- table = nft_table_lookup_byhandle(net, attr, genmask);
+ table = nft_table_lookup_byhandle(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
} else {
attr = nla[NFTA_TABLE_NAME];
- table = nft_table_lookup(net, attr, family, genmask);
+ table = nft_table_lookup(net, attr, family, genmask,
+ NETLINK_CB(skb).portid);
}
if (IS_ERR(table)) {
+ if (PTR_ERR(table) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(table);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
table->use > 0)
return -EBUSY;
@@ -1019,23 +1799,21 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk,
return nft_flush_table(&ctx);
}
-static void nf_tables_table_destroy(struct nft_ctx *ctx)
+static void nf_tables_table_destroy(struct nft_table *table)
{
- if (WARN_ON(ctx->table->use > 0))
+ if (WARN_ON(table->use > 0))
return;
- rhltable_destroy(&ctx->table->chains_ht);
- kfree(ctx->table->name);
- kfree(ctx->table);
+ rhltable_destroy(&table->chains_ht);
+ kfree(table->name);
+ kfree(table->udata);
+ kfree(table);
}
void nft_register_chain_type(const struct nft_chain_type *ctype)
{
- if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO))
- return;
-
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) {
+ if (WARN_ON(__nft_chain_type_get(ctype->family, ctype->type))) {
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
return;
}
@@ -1070,10 +1848,12 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
return ERR_PTR(-ENOENT);
}
-static bool lockdep_commit_lock_is_held(struct net *net)
+static bool lockdep_commit_lock_is_held(const struct net *net)
{
#ifdef CONFIG_PROVE_LOCKING
- return lockdep_is_held(&net->nft.commit_mutex);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ return lockdep_is_held(&nft_net->commit_mutex);
#else
return true;
#endif
@@ -1090,7 +1870,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- nla_strlcpy(search, nla, sizeof(search));
+ nla_strscpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
!lockdep_commit_lock_is_held(net));
@@ -1119,8 +1899,13 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_CHAIN_HOOK] = { .type = NLA_NESTED },
[NFTA_CHAIN_POLICY] = { .type = NLA_U32 },
- [NFTA_CHAIN_TYPE] = { .type = NLA_STRING },
+ [NFTA_CHAIN_TYPE] = { .type = NLA_STRING,
+ .len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED },
+ [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 },
+ [NFTA_CHAIN_ID] = { .type = NLA_U32 },
+ [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -1138,18 +1923,21 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
u64 pkts, bytes;
int cpu;
+ if (!stats)
+ return 0;
+
memset(&total, 0, sizeof(total));
for_each_possible_cpu(cpu) {
cpu_stats = per_cpu_ptr(stats, cpu);
do {
- seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ seq = u64_stats_fetch_begin(&cpu_stats->syncp);
pkts = cpu_stats->pkts;
bytes = cpu_stats->bytes;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq));
total.pkts += pkts;
total.bytes += bytes;
}
- nest = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+ nest = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS);
if (nest == NULL)
goto nla_put_failure;
@@ -1166,48 +1954,100 @@ nla_put_failure:
return -ENOSPC;
}
+static bool hook_is_prefix(struct nft_hook *hook)
+{
+ return strlen(hook->ifname) >= hook->ifnamelen;
+}
+
+static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook)
+{
+ int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME;
+
+ return nla_put_string(skb, attr, hook->ifname);
+}
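
hook_is_prefix() distinguishes exact device names from prefixes purely by length: for exact names, ifnamelen includes the terminating NUL, so a single strncmp() serves both cases (see nft_netdev_hook_alloc() further below). A userspace sketch of the comparison:

#include <stdio.h>
#include <string.h>

static int hook_matches(const char *ifname, size_t ifnamelen, const char *dev)
{
    return !strncmp(dev, ifname, ifnamelen);
}

int main(void)
{
    /* exact: length includes the NUL, so "eth0" must match completely */
    printf("\"eth0\" (exact) vs eth01: %d\n", hook_matches("eth0", 5, "eth01"));
    /* prefix: length excludes the NUL, so any "eth..." device matches */
    printf("\"eth\" (prefix) vs eth01: %d\n", hook_matches("eth", 3, "eth01"));
    return 0;
}
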
+
+static int nft_dump_basechain_hook(struct sk_buff *skb,
+ const struct net *net, int family,
+ const struct nft_base_chain *basechain,
+ const struct list_head *hook_list)
+{
+ const struct nf_hook_ops *ops = &basechain->ops;
+ struct nft_hook *hook, *first = NULL;
+ struct nlattr *nest, *nest_devs;
+ int n = 0;
+
+ nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
+ if (nest == NULL)
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+ goto nla_put_failure;
+
+ if (nft_base_chain_netdev(family, ops->hooknum)) {
+ nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
+ if (!nest_devs)
+ goto nla_put_failure;
+
+ if (!hook_list)
+ hook_list = &basechain->hook_list;
+
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
+ if (!first)
+ first = hook;
+
+ if (nft_nla_put_hook_dev(skb, hook))
+ goto nla_put_failure;
+ n++;
+ }
+ nla_nest_end(skb, nest_devs);
+
+ if (n == 1 &&
+ !hook_is_prefix(first) &&
+ nla_put_string(skb, NFTA_HOOK_DEV, first->ifname))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+nla_put_failure:
+ return -1;
+}
+
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event, u32 flags,
int family, const struct nft_table *table,
- const struct nft_chain *chain)
+ const struct nft_chain *chain,
+ const struct list_head *hook_list)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
- if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name))
- goto nla_put_failure;
- if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
+ if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
+ nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) ||
+ nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
NFTA_CHAIN_PAD))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name))
- goto nla_put_failure;
+
+ if (!hook_list &&
+ (event == NFT_MSG_DELCHAIN ||
+ event == NFT_MSG_DESTROYCHAIN)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
if (nft_is_base_chain(chain)) {
const struct nft_base_chain *basechain = nft_base_chain(chain);
- const struct nf_hook_ops *ops = &basechain->ops;
- struct nlattr *nest;
+ struct nft_stats __percpu *stats;
- nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
- if (nest == NULL)
- goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
- goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
- goto nla_put_failure;
- if (basechain->dev_name[0] &&
- nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name))
+ if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list))
goto nla_put_failure;
- nla_nest_end(skb, nest);
if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
htonl(basechain->policy)))
@@ -1216,14 +2056,23 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
goto nla_put_failure;
- if (rcu_access_pointer(basechain->stats) &&
- nft_dump_stats(skb, rcu_dereference(basechain->stats)))
+ stats = rcu_dereference_check(basechain->stats,
+ lockdep_commit_lock_is_held(net));
+ if (nft_dump_stats(skb, stats))
goto nla_put_failure;
}
+ if (chain->flags &&
+ nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags)))
+ goto nla_put_failure;
+
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
goto nla_put_failure;
+ if (chain->udata &&
+ nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1232,9 +2081,12 @@ nla_put_failure:
return -1;
}
-static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
+ const struct list_head *hook_list)
{
+ struct nftables_pernet *nft_net;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -1245,16 +2097,19 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
- ctx->chain);
+ event, flags, ctx->family, ctx->table,
+ ctx->chain, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_net = nft_pernet(ctx->net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -1264,16 +2119,18 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- const struct nft_chain *chain;
unsigned int idx = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ const struct nft_chain *chain;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -1291,7 +2148,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
NFT_MSG_NEWCHAIN,
NLM_F_MULTI,
table->family, table,
- chain) < 0)
+ chain, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1306,29 +2163,28 @@ done:
}
/* called with rcu_read_lock held */
-static int nf_tables_getchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_chains,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -1345,14 +2201,14 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
return -ENOMEM;
err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0,
- family, table, chain);
+ info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
+ 0, family, table, chain, NULL);
if (err < 0)
- goto err;
+ goto err_fill_chain_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_chain_info:
kfree_skb(skb2);
return err;
}
@@ -1369,17 +2225,17 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
struct nft_stats *stats;
int err;
- err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr,
+ nft_counter_policy, NULL);
if (err < 0)
- return ERR_PTR(err);
+ return ERR_PTR_PCPU(err);
if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
- return ERR_PTR(-EINVAL);
+ return ERR_PTR_PCPU(-EINVAL);
newstats = netdev_alloc_pcpu_stats(struct nft_stats);
if (newstats == NULL)
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR_PCPU(-ENOMEM);
/* Restore old counters on this cpu, no problem. Per-cpu statistics
* are not exposed to userspace.
@@ -1393,44 +2249,40 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
return newstats;
}
-static void nft_chain_stats_replace(struct net *net,
- struct nft_base_chain *chain,
- struct nft_stats __percpu *newstats)
+static void nft_chain_stats_replace(struct nft_trans_chain *trans)
{
- struct nft_stats __percpu *oldstats;
+ const struct nft_trans *t = &trans->nft_trans_binding.nft_trans;
+ struct nft_base_chain *chain = nft_base_chain(trans->chain);
- if (newstats == NULL)
+ if (!trans->stats)
return;
- if (rcu_access_pointer(chain->stats)) {
- oldstats = rcu_dereference_protected(chain->stats,
- lockdep_commit_lock_is_held(net));
- rcu_assign_pointer(chain->stats, newstats);
- synchronize_rcu();
- free_percpu(oldstats);
- } else {
- rcu_assign_pointer(chain->stats, newstats);
+ trans->stats =
+ rcu_replace_pointer(chain->stats, trans->stats,
+ lockdep_commit_lock_is_held(t->net));
+
+ if (!trans->stats)
static_branch_inc(&nft_counters_enabled);
- }
}
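
nft_chain_stats_replace() now swaps the counters with rcu_replace_pointer() and stores the displaced pointer back into the transaction, which disposes of it after the RCU grace period. The swap in a userspace sketch (an immediate free stands in for the grace period):

#include <stdio.h>
#include <stdlib.h>

struct stats { long pkts; };

static struct stats *stats_replace(struct stats **slot, struct stats *newstats)
{
    struct stats *old = *slot;

    *slot = newstats;   /* kernel: rcu_assign_pointer() inside the swap */
    return old;         /* kernel: freed only after the grace period */
}

int main(void)
{
    struct stats *cur = calloc(1, sizeof(*cur));
    struct stats *newstats = calloc(1, sizeof(*newstats));
    struct stats *old = stats_replace(&cur, newstats);

    free(old);          /* old counters handed back to the transaction */
    printf("stats replaced\n");
    free(cur);
    return 0;
}
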
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
- struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
- struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
+ struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
+ struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);
if (g0 != g1)
kvfree(g1);
kvfree(g0);
/* should be NULL either via abort or via successful commit */
- WARN_ON_ONCE(chain->rules_next);
- kvfree(chain->rules_next);
+ WARN_ON_ONCE(chain->blob_next);
+ kvfree(chain->blob_next);
}
-static void nf_tables_chain_destroy(struct nft_ctx *ctx)
+void nf_tables_chain_destroy(struct nft_chain *chain)
{
- struct nft_chain *chain = ctx->chain;
+ const struct nft_table *table = chain->table;
+ struct nft_hook *hook, *next;
if (WARN_ON(chain->use > 0))
return;
@@ -1441,87 +2293,266 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ list_for_each_entry_safe(hook, next,
+ &basechain->hook_list, list) {
+ list_del_rcu(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ }
module_put(basechain->type->owner);
if (rcu_access_pointer(basechain->stats)) {
static_branch_dec(&nft_counters_enabled);
free_percpu(rcu_dereference_raw(basechain->stats));
}
kfree(chain->name);
+ kfree(chain->udata);
kfree(basechain);
} else {
kfree(chain->name);
+ kfree(chain->udata);
kfree(chain);
}
}
+static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
+ const struct nlattr *attr,
+ bool prefix)
+{
+ struct nf_hook_ops *ops;
+ struct net_device *dev;
+ struct nft_hook *hook;
+ int err;
+
+ hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
+ if (!hook)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&hook->ops_list);
+
+ err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
+ if (err < 0)
+ goto err_hook_free;
+
+ /* include the terminating NUL-char when comparing non-prefixes */
+ hook->ifnamelen = strlen(hook->ifname) + !prefix;
+
+ /* nf_tables_netdev_event() is called under rtnl_mutex, this is
+ * indirectly serializing all the other holders of the commit_mutex with
+ * the rtnl_mutex.
+ */
+ for_each_netdev(net, dev) {
+ if (strncmp(dev->name, hook->ifname, hook->ifnamelen))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
+ err = -ENOMEM;
+ goto err_hook_free;
+ }
+ ops->dev = dev;
+ list_add_tail(&ops->list, &hook->ops_list);
+ }
+ return hook;
+
+err_hook_free:
+ nft_netdev_hook_free(hook);
+ return ERR_PTR(err);
+}
+
+static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
+ const struct nft_hook *this)
+{
+ struct nft_hook *hook;
+
+ list_for_each_entry(hook, hook_list, list) {
+ if (!strncmp(hook->ifname, this->ifname,
+ min(hook->ifnamelen, this->ifnamelen)))
+ return hook;
+ }
+
+ return NULL;
+}
+
+static int nf_tables_parse_netdev_hooks(struct net *net,
+ const struct nlattr *attr,
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack)
+{
+ struct nft_hook *hook, *next;
+ const struct nlattr *tmp;
+ int rem, n = 0, err;
+ bool prefix;
+
+ nla_for_each_nested(tmp, attr, rem) {
+ switch (nla_type(tmp)) {
+ case NFTA_DEVICE_NAME:
+ prefix = false;
+ break;
+ case NFTA_DEVICE_PREFIX:
+ prefix = true;
+ break;
+ default:
+ err = -EINVAL;
+ goto err_hook;
+ }
+
+ hook = nft_netdev_hook_alloc(net, tmp, prefix);
+ if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tmp);
+ err = PTR_ERR(hook);
+ goto err_hook;
+ }
+ if (nft_hook_list_find(hook_list, hook)) {
+ NL_SET_BAD_ATTR(extack, tmp);
+ nft_netdev_hook_free(hook);
+ err = -EEXIST;
+ goto err_hook;
+ }
+ list_add_tail(&hook->list, hook_list);
+ n++;
+
+ if (n == NFT_NETDEVICE_MAX) {
+ err = -EFBIG;
+ goto err_hook;
+ }
+ }
+
+ return 0;
+
+err_hook:
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_del(&hook->list);
+ nft_netdev_hook_free(hook);
+ }
+ return err;
+}
+
struct nft_chain_hook {
u32 num;
s32 priority;
const struct nft_chain_type *type;
- struct net_device *dev;
+ struct list_head list;
};
+static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
+ struct list_head *hook_list,
+ struct netlink_ext_ack *extack, u32 flags)
+{
+ struct nft_hook *hook;
+ int err;
+
+ if (tb[NFTA_HOOK_DEV]) {
+ hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false);
+ if (IS_ERR(hook)) {
+ NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]);
+ return PTR_ERR(hook);
+ }
+
+ list_add_tail(&hook->list, hook_list);
+ } else if (tb[NFTA_HOOK_DEVS]) {
+ err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
+ hook_list, extack);
+ if (err < 0)
+ return err;
+ }
+
+ if (flags & NFT_CHAIN_HW_OFFLOAD &&
+ list_empty(hook_list))
+ return -EINVAL;
+
+ return 0;
+}
+
static int nft_chain_parse_hook(struct net *net,
+ struct nft_base_chain *basechain,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- bool autoload)
+ u32 flags, struct netlink_ext_ack *extack)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
- struct net_device *dev;
int err;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
- err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
- nft_hook_policy, NULL);
+ err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX,
+ nla[NFTA_CHAIN_HOOK],
+ nft_hook_policy, NULL);
if (err < 0)
return err;
- if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
- ha[NFTA_HOOK_PRIORITY] == NULL)
- return -EINVAL;
+ if (!basechain) {
+ if (!ha[NFTA_HOOK_HOOKNUM] ||
+ !ha[NFTA_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
+ return -ENOENT;
+ }
- hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
- hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
- type = chain_type[family][NFT_CHAIN_T_DEFAULT];
- if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
- family, autoload);
- if (IS_ERR(type))
- return PTR_ERR(type);
- }
- if (!(type->hook_mask & (1 << hook->num)))
- return -EOPNOTSUPP;
+ type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+ if (!type)
+ return -EOPNOTSUPP;
- if (type->type == NFT_CHAIN_T_NAT &&
- hook->priority <= NF_IP_PRI_CONNTRACK)
- return -EOPNOTSUPP;
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+ family, true);
+ if (IS_ERR(type)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return PTR_ERR(type);
+ }
+ }
+ if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+ return -EOPNOTSUPP;
- if (!try_module_get(type->owner))
- return -ENOENT;
+ if (type->type == NFT_CHAIN_T_NAT &&
+ hook->priority <= NF_IP_PRI_CONNTRACK)
+ return -EOPNOTSUPP;
+ } else {
+ if (ha[NFTA_HOOK_HOOKNUM]) {
+ hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+ if (hook->num != basechain->ops.hooknum)
+ return -EOPNOTSUPP;
+ }
+ if (ha[NFTA_HOOK_PRIORITY]) {
+ hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+ if (hook->priority != basechain->ops.priority)
+ return -EOPNOTSUPP;
+ }
- hook->type = type;
+ if (nla[NFTA_CHAIN_TYPE]) {
+ type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+ family);
+ if (!type) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return -ENOENT;
+ }
+ } else {
+ type = basechain->type;
+ }
+ }
- hook->dev = NULL;
- if (family == NFPROTO_NETDEV) {
- char ifname[IFNAMSIZ];
+ if (!try_module_get(type->owner)) {
+ if (nla[NFTA_CHAIN_TYPE])
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+ return -ENOENT;
+ }
- if (!ha[NFTA_HOOK_DEV]) {
- module_put(type->owner);
- return -EOPNOTSUPP;
- }
+ hook->type = type;
- nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
- dev = __dev_get_by_name(net, ifname);
- if (!dev) {
+ INIT_LIST_HEAD(&hook->list);
+ if (nft_base_chain_netdev(family, hook->num)) {
+ err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags);
+ if (err < 0) {
module_put(type->owner);
- return -ENOENT;
+ return err;
}
- hook->dev = dev;
- } else if (ha[NFTA_HOOK_DEV]) {
+ } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) {
module_put(type->owner);
return -EOPNOTSUPP;
}
@@ -1531,175 +2562,331 @@ static int nft_chain_parse_hook(struct net *net,
static void nft_chain_release_hook(struct nft_chain_hook *hook)
{
+ struct nft_hook *h, *next;
+
+ list_for_each_entry_safe(h, next, &hook->list, list) {
+ list_del(&h->list);
+ nft_netdev_hook_free(h);
+ }
module_put(hook->type->owner);
}
-struct nft_rules_old {
- struct rcu_head h;
- struct nft_rule **start;
-};
+static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
+{
+ struct nft_rule_dp_last *lrule;
+
+ BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0);
+
+ lrule = (struct nft_rule_dp_last *)ptr;
+ lrule->end.is_last = 1;
+ lrule->chain = chain;
+ /* blob size does not include the trailer rule */
+}
-static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
- unsigned int alloc)
+static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain,
+ unsigned int size)
{
- if (alloc > INT_MAX)
+ struct nft_rule_blob *blob;
+
+ if (size > INT_MAX)
return NULL;
- alloc += 1; /* NULL, ends rules */
- if (sizeof(struct nft_rule *) > INT_MAX / alloc)
+ size += sizeof(struct nft_rule_blob) + sizeof(struct nft_rule_dp_last);
+
+ blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
+ if (!blob)
return NULL;
- alloc *= sizeof(struct nft_rule *);
- alloc += sizeof(struct nft_rules_old);
+ blob->size = 0;
+ nft_last_rule(chain, blob->data);
+
+ return blob;
+}
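
Rules are now packed into one contiguous blob terminated by a trailer marked is_last, so the datapath walks a single allocation instead of chasing a NULL-terminated array of rule pointers. A simplified userspace sketch of walking such a blob (real rules are variable-length; fixed-size records here for brevity):

#include <stdio.h>

struct rule { unsigned char is_last; unsigned char dlen; };

int main(void)
{
    /* two rules followed by the is_last trailer */
    struct rule blob[] = { { 0, 8 }, { 0, 4 }, { 1, 0 } };

    for (struct rule *r = blob; !r->is_last; r++)
        printf("rule with %u bytes of expression data\n", r->dlen);
    return 0;
}
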
+
+static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
+ const struct nft_chain_hook *hook,
+ struct nft_chain *chain)
+{
+ ops->pf = family;
+ ops->hooknum = hook->num;
+ ops->priority = hook->priority;
+ ops->priv = chain;
+ ops->hook = hook->type->hooks[ops->hooknum];
+ ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
+}
+
+static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
+ struct nft_chain_hook *hook, u32 flags)
+{
+ struct nft_chain *chain;
+ struct nf_hook_ops *ops;
+ struct nft_hook *h;
+
+ basechain->type = hook->type;
+ INIT_LIST_HEAD(&basechain->hook_list);
+ chain = &basechain->chain;
+
+ if (nft_base_chain_netdev(family, hook->num)) {
+ list_splice_init(&hook->list, &basechain->hook_list);
+ list_for_each_entry(h, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nft_basechain_hook_init(ops, family, hook, chain);
+ }
+ }
+ nft_basechain_hook_init(&basechain->ops, family, hook, chain);
+
+ chain->flags |= NFT_CHAIN_BASE | flags;
+ basechain->policy = NF_ACCEPT;
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
+ !nft_chain_offload_support(basechain)) {
+ list_splice_init(&basechain->hook_list, &hook->list);
+ return -EOPNOTSUPP;
+ }
- return kvmalloc(alloc, GFP_KERNEL);
+ flow_block_init(&basechain->flow_block);
+
+ return 0;
}
-static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy)
+int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
+{
+ int err;
+
+ err = rhltable_insert_key(&table->chains_ht, chain->name,
+ &chain->rhlhead, nft_chain_ht_params);
+ if (err)
+ return err;
+
+ list_add_tail_rcu(&chain->list, &table->chains);
+
+ return 0;
+}
+
+static u64 chain_id;
+
+static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy,
+ u32 flags, struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
struct nft_base_chain *basechain;
- struct nft_stats __percpu *stats;
struct net *net = ctx->net;
+ char name[NFT_NAME_MAXLEN];
+ struct nft_rule_blob *blob;
+ struct nft_trans *trans;
struct nft_chain *chain;
- struct nft_rule **rules;
int err;
- if (table->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_CHAIN_HOOK]) {
- struct nft_chain_hook hook;
- struct nf_hook_ops *ops;
+ struct nft_stats __percpu *stats = NULL;
+ struct nft_chain_hook hook = {};
+
+ if (table->flags & __NFT_TABLE_F_UPDATE)
+ return -EINVAL;
- err = nft_chain_parse_hook(net, nla, &hook, family, true);
+ if (flags & NFT_CHAIN_BINDING)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags,
+ extack);
if (err < 0)
return err;
- basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+ basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
if (basechain == NULL) {
nft_chain_release_hook(&hook);
return -ENOMEM;
}
-
- if (hook.dev != NULL)
- strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
+ chain = &basechain->chain;
if (nla[NFTA_CHAIN_COUNTERS]) {
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats)) {
+ if (IS_ERR_PCPU(stats)) {
nft_chain_release_hook(&hook);
kfree(basechain);
- return PTR_ERR(stats);
+ return PTR_ERR_PCPU(stats);
}
rcu_assign_pointer(basechain->stats, stats);
- static_branch_inc(&nft_counters_enabled);
}
- basechain->type = hook.type;
- chain = &basechain->chain;
-
- ops = &basechain->ops;
- ops->pf = family;
- ops->hooknum = hook.num;
- ops->priority = hook.priority;
- ops->priv = chain;
- ops->hook = hook.type->hooks[ops->hooknum];
- ops->dev = hook.dev;
-
- chain->flags |= NFT_BASE_CHAIN;
- basechain->policy = policy;
+ err = nft_basechain_init(basechain, family, &hook, flags);
+ if (err < 0) {
+ nft_chain_release_hook(&hook);
+ kfree(basechain);
+ free_percpu(stats);
+ return err;
+ }
+ if (stats)
+ static_branch_inc(&nft_counters_enabled);
} else {
- chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+ if (flags & NFT_CHAIN_BASE)
+ return -EINVAL;
+ if (flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
+
+ chain->flags = flags;
}
ctx->chain = chain;
INIT_LIST_HEAD(&chain->rules);
chain->handle = nf_tables_alloc_handle(table);
chain->table = table;
- chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+
+ if (nla[NFTA_CHAIN_NAME]) {
+ chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
+ } else {
+ if (!(flags & NFT_CHAIN_BINDING)) {
+ err = -EINVAL;
+ goto err_destroy_chain;
+ }
+
+ snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
+ chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
+ }
+
if (!chain->name) {
err = -ENOMEM;
- goto err1;
+ goto err_destroy_chain;
}
- rules = nf_tables_chain_alloc_rules(chain, 0);
- if (!rules) {
- err = -ENOMEM;
- goto err1;
+ if (nla[NFTA_CHAIN_USERDATA]) {
+ chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (chain->udata == NULL) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
+ chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
- *rules = NULL;
- rcu_assign_pointer(chain->rules_gen_0, rules);
- rcu_assign_pointer(chain->rules_gen_1, rules);
+ blob = nf_tables_chain_alloc_rules(chain, 0);
+ if (!blob) {
+ err = -ENOMEM;
+ goto err_destroy_chain;
+ }
- err = nf_tables_register_hook(net, table, chain);
- if (err < 0)
- goto err1;
+ RCU_INIT_POINTER(chain->blob_gen_0, blob);
+ RCU_INIT_POINTER(chain->blob_gen_1, blob);
- err = rhltable_insert_key(&table->chains_ht, chain->name,
- &chain->rhlhead, nft_chain_ht_params);
- if (err)
- goto err2;
+ if (!nft_use_inc(&table->use)) {
+ err = -EMFILE;
+ goto err_destroy_chain;
+ }
- err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
- if (err < 0) {
- rhltable_remove(&table->chains_ht, &chain->rhlhead,
- nft_chain_ht_params);
- goto err2;
+ trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto err_trans;
}
- table->use++;
- list_add_tail_rcu(&chain->list, &table->chains);
+ nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
+ if (nft_is_base_chain(chain))
+ nft_trans_chain_policy(trans) = policy;
+
+ err = nft_chain_add(table, chain);
+ if (err < 0)
+ goto err_chain_add;
+
+ /* This must be LAST to ensure no packets are walking over this chain. */
+ err = nf_tables_register_hook(net, table, chain);
+ if (err < 0)
+ goto err_register_hook;
return 0;
-err2:
- nf_tables_unregister_hook(net, table, chain);
-err1:
- nf_tables_chain_destroy(ctx);
+
+err_register_hook:
+ nft_chain_del(chain);
+err_chain_add:
+ nft_trans_destroy(trans);
+err_trans:
+ nft_use_dec_restore(&table->use);
+err_destroy_chain:
+ nf_tables_chain_destroy(chain);
return err;
}
-static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
+static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
+ u32 flags, const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
+ struct nft_base_chain *basechain = NULL;
struct nft_table *table = ctx->table;
struct nft_chain *chain = ctx->chain;
- struct nft_base_chain *basechain;
- struct nft_stats *stats = NULL;
- struct nft_chain_hook hook;
+ struct nft_chain_hook hook = {};
+ struct nft_stats __percpu *stats = NULL;
+ struct nftables_pernet *nft_net;
+ struct nft_hook *h, *next;
struct nf_hook_ops *ops;
struct nft_trans *trans;
+ bool unregister = false;
int err;
+ if (chain->flags ^ flags)
+ return -EOPNOTSUPP;
+
+ INIT_LIST_HEAD(&hook.list);
+
if (nla[NFTA_CHAIN_HOOK]) {
- if (!nft_is_base_chain(chain))
- return -EBUSY;
+ if (!nft_is_base_chain(chain)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
+ }
- err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- false);
+ basechain = nft_base_chain(chain);
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
+ ctx->family, flags, extack);
if (err < 0)
return err;
- basechain = nft_base_chain(chain);
if (basechain->type != hook.type) {
nft_chain_release_hook(&hook);
- return -EBUSY;
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
}
- ops = &basechain->ops;
- if (ops->hooknum != hook.num ||
- ops->priority != hook.priority ||
- ops->dev != hook.dev) {
- nft_chain_release_hook(&hook);
- return -EBUSY;
+ if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ list_for_each_entry(ops, &h->ops_list, list) {
+ ops->pf = basechain->ops.pf;
+ ops->hooknum = basechain->ops.hooknum;
+ ops->priority = basechain->ops.priority;
+ ops->priv = basechain->ops.priv;
+ ops->hook = basechain->ops.hook;
+ }
+
+ if (nft_hook_list_find(&basechain->hook_list, h)) {
+ list_del(&h->list);
+ nft_netdev_hook_free(h);
+ continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWCHAIN ||
+ trans->table != ctx->table ||
+ !nft_trans_chain_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) {
+ nft_chain_release_hook(&hook);
+ return -EEXIST;
+ }
+ }
+ }
+ } else {
+ ops = &basechain->ops;
+ if (ops->hooknum != hook.num ||
+ ops->priority != hook.priority) {
+ nft_chain_release_hook(&hook);
+ NL_SET_BAD_ATTR(extack, attr);
+ return -EEXIST;
+ }
}
- nft_chain_release_hook(&hook);
}
if (nla[NFTA_CHAIN_HANDLE] &&
@@ -1708,24 +2895,52 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
chain2 = nft_chain_lookup(ctx->net, table,
nla[NFTA_CHAIN_NAME], genmask);
- if (!IS_ERR(chain2))
- return -EEXIST;
+ if (!IS_ERR(chain2)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
+ err = -EEXIST;
+ goto err_hooks;
+ }
+ }
+
+ if (table->flags & __NFT_TABLE_F_UPDATE &&
+ !list_empty(&hook.list)) {
+ NL_SET_BAD_ATTR(extack, attr);
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nft_is_base_chain(chain) &&
+ !list_empty(&hook.list)) {
+ basechain = nft_base_chain(chain);
+ ops = &basechain->ops;
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+ err = nft_netdev_register_hooks(ctx->net, &hook.list);
+ if (err < 0)
+ goto err_hooks;
+
+ unregister = true;
+ }
}
if (nla[NFTA_CHAIN_COUNTERS]) {
- if (!nft_is_base_chain(chain))
- return -EOPNOTSUPP;
+ if (!nft_is_base_chain(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_hooks;
+ }
stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
- if (IS_ERR(stats))
- return PTR_ERR(stats);
+ if (IS_ERR_PCPU(stats)) {
+ err = PTR_ERR_PCPU(stats);
+ goto err_hooks;
+ }
}
err = -ENOMEM;
- trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
- sizeof(struct nft_trans_chain));
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN);
if (trans == NULL)
- goto err;
+ goto err_trans;
nft_trans_chain_stats(trans) = stats;
nft_trans_chain_update(trans) = true;
@@ -1737,55 +2952,98 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
if (nla[NFTA_CHAIN_HANDLE] &&
nla[NFTA_CHAIN_NAME]) {
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_trans *tmp;
char *name;
err = -ENOMEM;
- name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
if (!name)
- goto err;
+ goto err_trans;
err = -EEXIST;
- list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
+ list_for_each_entry(tmp, &nft_net->commit_list, list) {
if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
- tmp->ctx.table == table &&
+ tmp->table == table &&
nft_trans_chain_update(tmp) &&
nft_trans_chain_name(tmp) &&
strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
kfree(name);
- goto err;
+ goto err_trans;
}
}
nft_trans_chain_name(trans) = name;
}
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+ nft_trans_basechain(trans) = basechain;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&hook.list, &nft_trans_chain_hooks(trans));
+ if (nla[NFTA_CHAIN_HOOK])
+ module_put(hook.type->owner);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
return 0;
-err:
+
+err_trans:
free_percpu(stats);
kfree(trans);
+err_hooks:
+ if (nla[NFTA_CHAIN_HOOK]) {
+ list_for_each_entry_safe(h, next, &hook.list, list) {
+ if (unregister) {
+ list_for_each_entry(ops, &h->ops_list, list)
+ nf_unregister_net_hook(ctx->net, ops);
+ }
+ list_del(&h->list);
+ nft_netdev_hook_free_rcu(h);
+ }
+ module_put(hook.type->owner);
+ }
+
return err;
}
-static int nf_tables_newchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
+ const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans *trans;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type == NFT_MSG_NEWCHAIN &&
+ nft_trans_chain(trans)->table == table &&
+ id == nft_trans_chain_id(trans) &&
+ nft_active_genmask(nft_trans_chain(trans), genmask))
+ return nft_trans_chain(trans);
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_chain *chain = NULL;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
- struct nft_chain *chain;
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
+ u32 flags = 0;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -1802,7 +3060,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
return PTR_ERR(chain);
}
attr = nla[NFTA_CHAIN_HANDLE];
- } else {
+ } else if (nla[NFTA_CHAIN_NAME]) {
chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
@@ -1811,6 +3069,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
chain = NULL;
}
+ } else if (!nla[NFTA_CHAIN_ID]) {
+ return -EINVAL;
}
if (nla[NFTA_CHAIN_POLICY]) {
@@ -1836,30 +3096,94 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ if (nla[NFTA_CHAIN_FLAGS])
+ flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
+ else if (chain)
+ flags = chain->flags;
+
+ if (flags & ~NFT_CHAIN_FLAGS)
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain != NULL) {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, attr);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return nf_tables_updchain(&ctx, genmask, policy);
+ flags |= chain->flags & NFT_CHAIN_BASE;
+ return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
+ extack);
}
- return nf_tables_addchain(&ctx, family, genmask, policy);
+ return nf_tables_addchain(&ctx, family, policy, flags, extack);
}
-static int nf_tables_delchain(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nft_delchain_hook(struct nft_ctx *ctx,
+ struct nft_base_chain *basechain,
+ struct netlink_ext_ack *extack)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ const struct nft_chain *chain = &basechain->chain;
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_chain_hook chain_hook = {};
+ struct nft_hook *this, *hook;
+ LIST_HEAD(chain_del_list);
+ struct nft_trans *trans;
+ int err;
+
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return -EOPNOTSUPP;
+
+ err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook,
+ ctx->family, chain->flags, extack);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry(this, &chain_hook.list, list) {
+ hook = nft_hook_list_find(&basechain->hook_list, this);
+ if (!hook) {
+ err = -ENOENT;
+ goto err_chain_del_hook;
+ }
+ list_move(&hook->list, &chain_del_list);
+ }
+
+ trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_chain_del_hook;
+ }
+
+ nft_trans_basechain(trans) = basechain;
+ nft_trans_chain_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+ list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
+ nft_chain_release_hook(&chain_hook);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_chain_del_hook:
+ list_splice(&chain_del_list, &basechain->hook_list);
+ nft_chain_release_hook(&chain_hook);
+
+ return err;
+}
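
nft_delchain_hook() stages each matched hook on a private chain_del_list; if any lookup misses, the staged entries are spliced back so basechain->hook_list is left exactly as found. A toy version of that move-then-splice-back idiom on a plain singly linked list (invented types; unlike the kernel's list_splice(), this sketch does not preserve ordering):

#include <stddef.h>

struct node { int id; struct node *next; };

/* detach the node with @id from *head and push it onto *priv */
static int move_to_priv(struct node **head, struct node **priv, int id)
{
	struct node **p;

	for (p = head; *p; p = &(*p)->next) {
		if ((*p)->id == id) {
			struct node *n = *p;

			*p = n->next;
			n->next = *priv;
			*priv = n;
			return 0;
		}
	}
	return -1;	/* ENOENT: caller must undo the partial moves */
}

/* put every staged node back so the source list is intact again */
static void splice_back(struct node **head, struct node **priv)
{
	while (*priv) {
		struct node *n = *priv;

		*priv = n->next;
		n->next = *head;
		*head = n;
	}
}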
+
+static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_chain *chain;
@@ -1869,7 +3193,8 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
u32 use;
int err;
- table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
return PTR_ERR(table);
@@ -1884,16 +3209,36 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup(net, table, attr, genmask);
}
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(chain);
}
- if (nlh->nlmsg_flags & NLM_F_NONREC &&
+ if (nft_chain_binding(chain))
+ return -EOPNOTSUPP;
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
+
+ if (nla[NFTA_CHAIN_HOOK]) {
+ if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN ||
+ chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ return -EOPNOTSUPP;
+
+ if (nft_is_base_chain(chain)) {
+ struct nft_base_chain *basechain = nft_base_chain(chain);
+
+ if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
+ return nft_delchain_hook(&ctx, basechain, extack);
+ }
+ }
+
+ if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
-
use = chain->use;
list_for_each_entry(rule, &chain->rules, list) {
if (!nft_is_active_next(net, rule))
@@ -1922,15 +3267,15 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
/**
* nft_register_expr - register nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
* Registers the expr type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
*/
int nft_register_expr(struct nft_expr_type *type)
{
- if (!nft_expr_check_ops(type->ops))
- return -EINVAL;
+ if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
+ return -ENOMEM;
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
@@ -1944,7 +3289,7 @@ EXPORT_SYMBOL_GPL(nft_register_expr);
/**
* nft_unregister_expr - unregister nf_tables expr type
- * @ops: expr type
+ * @type: expr type
*
 * Unregisters the expr type for use with nf_tables.
*/
@@ -1959,15 +3304,30 @@ EXPORT_SYMBOL_GPL(nft_unregister_expr);
static const struct nft_expr_type *__nft_expr_type_get(u8 family,
struct nlattr *nla)
{
- const struct nft_expr_type *type;
+ const struct nft_expr_type *type, *candidate = NULL;
- list_for_each_entry(type, &nf_tables_expressions, list) {
- if (!nla_strcmp(nla, type->name) &&
- (!type->family || type->family == family))
- return type;
+ list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
+ if (!nla_strcmp(nla, type->name)) {
+ if (!type->family && !candidate)
+ candidate = type;
+ else if (type->family == family)
+ candidate = type;
+ }
}
- return NULL;
+ return candidate;
+}
+
+#ifdef CONFIG_MODULES
+static int nft_expr_type_request_module(struct net *net, u8 family,
+ struct nlattr *nla)
+{
+ if (nft_request_module(net, "nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla)) == -EAGAIN)
+ return -EAGAIN;
+
+ return 0;
}
+#endif
static const struct nft_expr_type *nft_expr_type_get(struct net *net,
u8 family,
@@ -1978,21 +3338,23 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
+ rcu_read_lock();
type = __nft_expr_type_get(family, nla);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- if (__nft_expr_type_get(family, nla))
+ if (nft_expr_type_request_module(net, family, nla) == -EAGAIN)
return ERR_PTR(-EAGAIN);
- nft_request_module(net, "nft-expr-%.*s",
- nla_len(nla), (char *)nla_data(nla));
- if (__nft_expr_type_get(family, nla))
+ if (nft_request_module(net, "nft-expr-%.*s",
+ nla_len(nla),
+ (char *)nla_data(nla)) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
@@ -2000,21 +3362,23 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
}
static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
- [NFTA_EXPR_NAME] = { .type = NLA_STRING },
+ [NFTA_EXPR_NAME] = { .type = NLA_STRING,
+ .len = NFT_MODULE_AUTOLOAD_LIMIT },
[NFTA_EXPR_DATA] = { .type = NLA_NESTED },
};
static int nf_tables_fill_expr_info(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
goto nla_put_failure;
if (expr->ops->dump) {
- struct nlattr *data = nla_nest_start(skb, NFTA_EXPR_DATA);
+ struct nlattr *data = nla_nest_start_noflag(skb,
+ NFTA_EXPR_DATA);
if (data == NULL)
goto nla_put_failure;
- if (expr->ops->dump(skb, expr) < 0)
+ if (expr->ops->dump(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, data);
}
@@ -2026,14 +3390,14 @@ nla_put_failure:
};
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, attr);
+ nest = nla_nest_start_noflag(skb, attr);
if (!nest)
goto nla_put_failure;
- if (nf_tables_fill_expr_info(skb, expr) < 0)
+ if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
return 0;
@@ -2044,6 +3408,7 @@ nla_put_failure:
struct nft_expr_info {
const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
};
@@ -2056,7 +3421,8 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
struct nlattr *tb[NFTA_EXPR_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
+ nft_expr_policy, NULL);
if (err < 0)
return err;
@@ -2065,8 +3431,9 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
return PTR_ERR(type);
if (tb[NFTA_EXPR_DATA]) {
- err = nla_parse_nested(info->tb, type->maxattr,
- tb[NFTA_EXPR_DATA], type->policy, NULL);
+ err = nla_parse_nested_deprecated(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA],
+ type->policy, NULL);
if (err < 0)
goto err1;
} else
@@ -2077,16 +3444,21 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
(const struct nlattr * const *)info->tb);
if (IS_ERR(ops)) {
err = PTR_ERR(ops);
- goto err1;
- }
- if (!nft_expr_check_ops(ops)) {
- err = -EINVAL;
+#ifdef CONFIG_MODULES
+ if (err == -EAGAIN)
+ if (nft_expr_type_request_module(ctx->net,
+ ctx->family,
+ tb[NFTA_EXPR_NAME]) != -EAGAIN)
+ err = -ENOENT;
+#endif
goto err1;
}
} else
ops = type->ops;
+ info->attr = nla;
info->ops = ops;
+
return 0;
err1:
@@ -2094,16 +3466,65 @@ err1:
return err;
}
+int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
+ struct nft_expr_info *info)
+{
+ struct nlattr *tb[NFTA_EXPR_MAX + 1];
+ const struct nft_expr_type *type;
+ int err;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
+ nft_expr_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME])
+ return -EINVAL;
+
+ rcu_read_lock();
+
+ type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ if (!type) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ if (!type->inner_ops) {
+ err = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ err = nla_parse_nested_deprecated(info->tb, type->maxattr,
+ tb[NFTA_EXPR_DATA],
+ type->policy, NULL);
+ if (err < 0)
+ goto out_unlock;
+
+ info->attr = nla;
+ info->ops = type->inner_ops;
+
+ /* No module reference will be taken on type->owner.
+ * Presence of type->inner_ops implies that the expression
+ * is builtin, so it cannot go away.
+ */
+ rcu_read_unlock();
+ return 0;
+
+out_unlock:
+ rcu_read_unlock();
+ return err;
+}
+
static int nf_tables_newexpr(const struct nft_ctx *ctx,
- const struct nft_expr_info *info,
+ const struct nft_expr_info *expr_info,
struct nft_expr *expr)
{
- const struct nft_expr_ops *ops = info->ops;
+ const struct nft_expr_ops *ops = expr_info->ops;
int err;
expr->ops = ops;
if (ops->init) {
- err = ops->init(ctx, expr, (const struct nlattr **)info->tb);
+ err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
if (err < 0)
goto err1;
}
@@ -2117,40 +3538,68 @@ err1:
static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
struct nft_expr *expr)
{
+ const struct nft_expr_type *type = expr->ops->type;
+
if (expr->ops->destroy)
expr->ops->destroy(ctx, expr);
- module_put(expr->ops->type->owner);
+ module_put(type->owner);
}
-struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
- const struct nlattr *nla)
+static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
{
- struct nft_expr_info info;
+ struct nft_expr_info expr_info;
struct nft_expr *expr;
+ struct module *owner;
int err;
- err = nf_tables_expr_parse(ctx, nla, &info);
+ err = nf_tables_expr_parse(ctx, nla, &expr_info);
if (err < 0)
- goto err1;
+ goto err_expr_parse;
+
+ err = -EOPNOTSUPP;
+ if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_expr_stateful;
err = -ENOMEM;
- expr = kzalloc(info.ops->size, GFP_KERNEL);
+ expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
if (expr == NULL)
- goto err2;
+ goto err_expr_stateful;
- err = nf_tables_newexpr(ctx, &info, expr);
+ err = nf_tables_newexpr(ctx, &expr_info, expr);
if (err < 0)
- goto err3;
+ goto err_expr_new;
return expr;
-err3:
+err_expr_new:
kfree(expr);
-err2:
- module_put(info.ops->type->owner);
-err1:
+err_expr_stateful:
+ owner = expr_info.ops->type->owner;
+ if (expr_info.ops->type->release_ops)
+ expr_info.ops->type->release_ops(expr_info.ops);
+
+ module_put(owner);
+err_expr_parse:
return ERR_PTR(err);
}
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
+{
+ int err;
+
+ if (WARN_ON_ONCE(!src->ops->clone))
+ return -EINVAL;
+
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src, gfp);
+ if (err < 0)
+ return err;
+
+ __module_get(src->ops->type->owner);
+
+ return 0;
+}
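
nft_expr_clone() pairs every successful copy with an extra reference on the expression type's owning module, so destroying the clone later can unconditionally drop one. A rough userspace analogue with a plain reference count (names invented for illustration):

#include <string.h>

struct owner { int refcnt; };
struct expr { struct owner *owner; char data[32]; };

static int expr_clone(struct expr *dst, const struct expr *src)
{
	memcpy(dst->data, src->data, sizeof(dst->data));
	dst->owner = src->owner;
	dst->owner->refcnt++;		/* mirrors __module_get() */
	return 0;
}

static void expr_destroy(struct expr *e)
{
	e->owner->refcnt--;		/* mirrors the module_put() in
					 * nf_tables_expr_destroy() */
}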
+
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
nf_tables_expr_destroy(ctx, expr);
@@ -2161,13 +3610,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
* Rules
*/
-static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *__nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
u64 handle)
{
struct nft_rule *rule;
// FIXME: this sucks
- list_for_each_entry_rcu(rule, &chain->rules, list) {
+ list_for_each_entry_rcu(rule, &chain->rules, list,
+ lockdep_commit_lock_is_held(net)) {
if (handle == rule->handle)
return rule;
}
@@ -2175,13 +3626,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain,
return ERR_PTR(-ENOENT);
}
-static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain,
+static struct nft_rule *nft_rule_lookup(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
if (nla == NULL)
return ERR_PTR(-EINVAL);
- return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla)));
+ return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla)));
}
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
@@ -2190,12 +3642,14 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
[NFTA_RULE_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
[NFTA_RULE_HANDLE] = { .type = NLA_U64 },
- [NFTA_RULE_EXPRESSIONS] = { .type = NLA_NESTED },
+ [NFTA_RULE_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
[NFTA_RULE_COMPAT] = { .type = NLA_NESTED },
[NFTA_RULE_POSITION] = { .type = NLA_U64 },
[NFTA_RULE_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_RULE_ID] = { .type = NLA_U32 },
+ [NFTA_RULE_POSITION_ID] = { .type = NLA_U32 },
+ [NFTA_RULE_CHAIN_ID] = { .type = NLA_U32 },
};
static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
@@ -2203,24 +3657,19 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u32 flags, int family,
const struct nft_table *table,
const struct nft_chain *chain,
- const struct nft_rule *rule)
+ const struct nft_rule *rule, u64 handle,
+ bool reset)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
const struct nft_expr *expr, *next;
struct nlattr *list;
- const struct nft_rule *prule;
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
+ nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
@@ -2229,19 +3678,20 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
NFTA_RULE_PAD))
goto nla_put_failure;
- if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
- prule = list_prev_entry(rule, list);
- if (nla_put_be64(skb, NFTA_RULE_POSITION,
- cpu_to_be64(prule->handle),
+ if (event != NFT_MSG_DELRULE && handle) {
+ if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle),
NFTA_RULE_PAD))
goto nla_put_failure;
}
- list = nla_nest_start(skb, NFTA_RULE_EXPRESSIONS);
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_stats(chain, rule);
+
+ list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS);
if (list == NULL)
goto nla_put_failure;
nft_rule_for_each_expr(expr, next, rule) {
- if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0)
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, list);
@@ -2264,7 +3714,11 @@ nla_put_failure:
static void nf_tables_rule_notify(const struct nft_ctx *ctx,
const struct nft_rule *rule, int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
+ const struct nft_rule *prule;
struct sk_buff *skb;
+ u64 handle = 0;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -2275,24 +3729,48 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (event == NFT_MSG_NEWRULE &&
+ !list_is_first(&rule->list, &ctx->chain->rules) &&
+ !list_is_last(&rule->list, &ctx->chain->rules)) {
+ prule = list_prev_entry(rule, list);
+ handle = prule->handle;
+ }
+ if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE))
+ flags |= NLM_F_APPEND;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
- event, 0, ctx->family, ctx->table,
- ctx->chain, rule);
+ event, flags, ctx->family, ctx->table,
+ ctx->chain, rule, handle, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+static void audit_log_rule_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+ table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
struct nft_rule_dump_ctx {
+ unsigned int s_idx;
char *table;
char *chain;
+ bool reset;
};
static int __nf_tables_dump_rules(struct sk_buff *skb,
@@ -2301,56 +3779,71 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
const struct nft_table *table,
const struct nft_chain *chain)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
- unsigned int s_idx = cb->args[0];
- const struct nft_rule *rule;
+ const struct nft_rule *rule, *prule;
+ unsigned int entries = 0;
+ int ret = 0;
+ u64 handle;
+ prule = NULL;
list_for_each_entry_rcu(rule, &chain->rules, list) {
if (!nft_is_active(net, rule))
+ goto cont_skip;
+ if (*idx < ctx->s_idx)
goto cont;
- if (*idx < s_idx)
- goto cont;
- if (*idx > s_idx) {
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- }
+ if (prule)
+ handle = prule->handle;
+ else
+ handle = 0;
+
if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWRULE,
NLM_F_MULTI | NLM_F_APPEND,
table->family,
- table, chain, rule) < 0)
- return 1;
-
+ table, chain, rule, handle, ctx->reset) < 0) {
+ ret = 1;
+ break;
+ }
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
+ prule = rule;
+cont_skip:
(*idx)++;
}
- return 0;
+
+ if (ctx->reset && entries)
+ audit_log_rule_reset(table, cb->seq, entries);
+
+ return ret;
}
static int nf_tables_dump_rules(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
struct nft_table *table;
const struct nft_chain *chain;
unsigned int idx = 0;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
- if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
+ if (ctx->table && strcmp(ctx->table, table->name) != 0)
continue;
- if (ctx && ctx->table && ctx->chain) {
+ if (ctx->table && ctx->chain) {
struct rhlist_head *list, *tmp;
list = rhltable_lookup(&table->chains_ht, ctx->chain,
@@ -2369,129 +3862,198 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
}
list_for_each_entry_rcu(chain, &table->chains, list) {
- if (__nf_tables_dump_rules(skb, &idx, cb, table, chain))
+ if (__nf_tables_dump_rules(skb, &idx,
+ cb, table, chain))
goto done;
}
- if (ctx && ctx->table)
+ if (ctx->table)
break;
}
done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+static int nf_tables_dumpreset_rules(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+	/* The mutex is held to prevent two concurrent dump-and-reset calls
+	 * from underrunning counters and quotas. The commit_mutex is used
+	 * for lack of a better lock; this is not the transaction path.
+	 */
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_rules(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
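
The reason for serializing dump-and-reset, as the comment in the body explains, is that reporting a counter and consuming it must be one atomic step with respect to a second dumper. A small userspace analogue with a pthread mutex (illustrative only):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long counter;

/* without the lock, two dumpers could both read the same value and
 * then both zero it, so one reports traffic the other already
 * consumed: the "underrun" the comment above warns about */
static unsigned long long dump_and_reset(void)
{
	unsigned long long v;

	pthread_mutex_lock(&lock);
	v = counter;		/* report ... */
	counter = 0;		/* ... and consume, in one critical section */
	pthread_mutex_unlock(&lock);
	return v;
}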
+
static int nf_tables_dump_rules_start(struct netlink_callback *cb)
{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_rule_dump_ctx *ctx = NULL;
- if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
- ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
- if (!ctx)
- return -ENOMEM;
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_RULE_TABLE]) {
- ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_ATOMIC);
- if (!ctx->table) {
- kfree(ctx);
- return -ENOMEM;
- }
- }
- if (nla[NFTA_RULE_CHAIN]) {
- ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_ATOMIC);
- if (!ctx->chain) {
- kfree(ctx->table);
- kfree(ctx);
- return -ENOMEM;
- }
+ if (nla[NFTA_RULE_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
+ }
+ if (nla[NFTA_RULE_CHAIN]) {
+ ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC);
+ if (!ctx->chain) {
+ kfree(ctx->table);
+ return -ENOMEM;
}
}
-
- cb->data = ctx;
return 0;
}
+static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
+{
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_rules_start(cb);
+}
+
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
- struct nft_rule_dump_ctx *ctx = cb->data;
+ struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
- if (ctx) {
- kfree(ctx->table);
- kfree(ctx->chain);
- kfree(ctx);
- }
+ kfree(ctx->table);
+ kfree(ctx->chain);
return 0;
}
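
The BUILD_BUG_ON() in nf_tables_dump_rules_start() is what lets the dump context live directly inside netlink_callback's fixed scratch area instead of a separate kmalloc() as before: the overlay is rejected at compile time if it ever outgrows the scratch space. A standalone sketch of the same guarantee using C11 static_assert (struct sizes are made up):

#include <assert.h>

struct callback { long ctx[6]; };	/* fixed scratch, like netlink_callback */

struct rule_dump_ctx {
	unsigned int s_idx;
	char *table;
	char *chain;
	_Bool reset;
};

/* the userspace spelling of BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)) */
static_assert(sizeof(struct rule_dump_ctx) <=
	      sizeof(((struct callback *)0)->ctx),
	      "dump context must fit the callback scratch area");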
-/* called with rcu_read_lock held */
-static int nf_tables_getrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_chain *chain;
const struct nft_rule *rule;
+ struct net *net = info->net;
struct nft_table *table;
struct sk_buff *skb2;
- int family = nfmsg->nfgen_family;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start= nf_tables_dump_rules_start,
- .dump = nf_tables_dump_rules,
- .done = nf_tables_dump_rules_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
- }
-
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ return ERR_CAST(chain);
}
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
- return PTR_ERR(rule);
+ return ERR_CAST(rule);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
- family, table, chain, rule);
- if (err < 0)
- goto err;
+ err = nf_tables_fill_rule_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+ family, table, chain, rule, 0, reset);
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return skb2;
+}
-err:
- kfree_skb(skb2);
- return err;
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+			.start = nf_tables_dump_rules_start,
+ .dump = nf_tables_dump_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ skb2 = nf_tables_getrule_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
-static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+static int nf_tables_getrule_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+			.start = nf_tables_dumpreset_rules_start,
+ .dump = nf_tables_dumpreset_rules,
+ .done = nf_tables_dump_rules_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getrule_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_RULE_TABLE]),
+ (char *)nla_data(nla[NFTA_RULE_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
+}
+
+void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
{
struct nft_expr *expr, *next;
@@ -2500,7 +4062,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
* is called on error from nf_tables_newrule().
*/
expr = nft_expr_first(rule);
- while (expr != nft_expr_last(rule) && expr->ops) {
+ while (nft_expr_more(rule, expr)) {
next = nft_expr_next(expr);
nf_tables_expr_destroy(ctx, expr);
expr = next;
@@ -2508,17 +4070,27 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
kfree(rule);
}
-static void nf_tables_rule_release(const struct nft_ctx *ctx,
- struct nft_rule *rule)
+/* can only be used if the rule is no longer visible to dumps */
+static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
- nft_rule_expr_deactivate(ctx, rule);
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
+ nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
nf_tables_rule_destroy(ctx, rule);
}
+/**
+ * nft_chain_validate - loop detection and hook validation
+ * @ctx: context containing call depth and base chain
+ * @chain: chain to validate
+ *
+ * Walk through the rules of the given chain and chase all jumps/gotos
+ * and set lookups until either the jump limit is hit or all reachable
+ * chains have been validated.
+ */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
struct nft_expr *expr, *last;
- const struct nft_data *data;
struct nft_rule *rule;
int err;
@@ -2526,6 +4098,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
return -EMLINK;
list_for_each_entry(rule, &chain->rules, list) {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
if (!nft_is_active_next(ctx->net, rule))
continue;
@@ -2533,7 +4108,10 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (!expr->ops->validate)
continue;
- err = expr->ops->validate(ctx, expr, &data);
+		/* This may call nft_chain_validate() recursively;
+		 * callers that do so must increment ctx->level.
+		 */
+ err = expr->ops->validate(ctx, expr);
if (err < 0)
return err;
}
@@ -2560,111 +4138,197 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
err = nft_chain_validate(&ctx, chain);
if (err < 0)
return err;
+
+ cond_resched();
}
return 0;
}
+int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
+ const struct nft_data *data;
+ int err;
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
+ return 0;
+
+ data = nft_set_ext_data(ext);
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ pctx->level++;
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ pctx->level--;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
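
Jump and goto verdicts recurse back into nft_chain_validate() with pctx->level bumped, and the depth check at the top of that function converts both over-deep rulesets and chain loops into -EMLINK, since a cycle keeps increasing the level until the limit trips. A condensed model of the depth-limited walk (MAXDEPTH stands in for the kernel's jump stack limit):

#include <errno.h>

#define MAXDEPTH 16	/* stand-in for the kernel's jump stack limit */

struct chain { int nr_targets; const struct chain **targets; };

/* depth-limited DFS: each jump/goto recurses one level deeper, so a
 * chain loop inevitably exceeds MAXDEPTH and fails with -EMLINK */
static int validate(const struct chain *c, int level)
{
	int i, err;

	if (level > MAXDEPTH)
		return -EMLINK;

	for (i = 0; i < c->nr_targets; i++) {
		err = validate(c->targets[i], level + 1);
		if (err < 0)
			return err;
	}
	return 0;
}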
+
+int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ struct nft_set_iter dummy_iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ };
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, dummy_iter.genmask))
+ continue;
+
+ ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem);
+ if (ret < 0)
+ return ret;
+ }
+
+ return ret;
+}
+
+static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
+ const struct nlattr *nla);
+
#define NFT_RULE_MAXEXPRS 128
-static int nf_tables_newrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_expr_info *info = NULL;
- int family = nfmsg->nfgen_family;
- struct nft_table *table;
- struct nft_chain *chain;
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ unsigned int size, i, n, ulen = 0, usize = 0;
+ u8 genmask = nft_genmask_next(info->net);
struct nft_rule *rule, *old_rule = NULL;
+ struct nft_expr_info *expr_info = NULL;
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_flow_rule *flow = NULL;
+ struct net *net = info->net;
struct nft_userdata *udata;
- struct nft_trans *trans = NULL;
+ struct nft_table *table;
+ struct nft_chain *chain;
+ struct nft_trans *trans;
+ u64 handle, pos_handle;
struct nft_expr *expr;
struct nft_ctx ctx;
struct nlattr *tmp;
- unsigned int size, i, n, ulen = 0, usize = 0;
int err, rem;
- u64 handle, pos_handle;
- lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
}
- chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
- if (IS_ERR(chain)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
- return PTR_ERR(chain);
+ if (nla[NFTA_RULE_CHAIN]) {
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+ genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
+ return PTR_ERR(chain);
+ }
+
+ } else if (nla[NFTA_RULE_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
+ genmask);
+ if (IS_ERR(chain)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
+ return PTR_ERR(chain);
+ }
+ } else {
+ return -EINVAL;
}
+ if (nft_chain_is_bound(chain))
+ return -EOPNOTSUPP;
+
if (nla[NFTA_RULE_HANDLE]) {
handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
- rule = __nft_rule_lookup(chain, handle);
+ rule = __nft_rule_lookup(net, chain, handle);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
old_rule = rule;
else
return -EOPNOTSUPP;
} else {
- if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
- nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
+ info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);
- if (chain->use == UINT_MAX)
- return -EOVERFLOW;
-
if (nla[NFTA_RULE_POSITION]) {
pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
- old_rule = __nft_rule_lookup(chain, pos_handle);
+ old_rule = __nft_rule_lookup(net, chain, pos_handle);
if (IS_ERR(old_rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
return PTR_ERR(old_rule);
}
+ } else if (nla[NFTA_RULE_POSITION_ID]) {
+ old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
+ if (IS_ERR(old_rule)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
+ return PTR_ERR(old_rule);
+ }
}
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) {
- info = kvmalloc_array(NFT_RULE_MAXEXPRS,
- sizeof(struct nft_expr_info),
- GFP_KERNEL);
- if (!info)
+ expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+ sizeof(struct nft_expr_info),
+ GFP_KERNEL);
+ if (!expr_info)
return -ENOMEM;
nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
- goto err1;
+ goto err_release_expr;
if (n == NFT_RULE_MAXEXPRS)
- goto err1;
- err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
- if (err < 0)
- goto err1;
- size += info[n].ops->size;
+ goto err_release_expr;
+ err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, tmp);
+ goto err_release_expr;
+ }
+ size += expr_info[n].ops->size;
n++;
}
}
/* Check for overflow of dlen field */
err = -EFBIG;
if (size >= 1 << 12)
- goto err1;
+ goto err_release_expr;
if (nla[NFTA_RULE_USERDATA]) {
ulen = nla_len(nla[NFTA_RULE_USERDATA]);
@@ -2673,9 +4337,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
}
err = -ENOMEM;
- rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+ rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
if (rule == NULL)
- goto err1;
+ goto err_release_expr;
nft_activate_next(net, rule);
@@ -2691,37 +4355,56 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
expr = nft_expr_first(rule);
for (i = 0; i < n; i++) {
- err = nf_tables_newexpr(&ctx, &info[i], expr);
- if (err < 0)
- goto err2;
+ err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, expr_info[i].attr);
+ goto err_release_rule;
+ }
- if (info[i].ops->validate)
- nft_validate_state_update(net, NFT_VALIDATE_NEED);
+ if (expr_info[i].ops->validate)
+ nft_validate_state_update(table, NFT_VALIDATE_NEED);
- info[i].ops = NULL;
+ expr_info[i].ops = NULL;
expr = nft_expr_next(expr);
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
+ flow = nft_flow_rule_create(net, rule);
+ if (IS_ERR(flow)) {
+ err = PTR_ERR(flow);
+ goto err_release_rule;
+ }
+ }
+
+ if (!nft_use_inc(&chain->use)) {
+ err = -EMFILE;
+ goto err_release_rule;
+ }
+
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (nft_chain_binding(chain)) {
+ err = -EOPNOTSUPP;
+ goto err_destroy_flow_rule;
+ }
+
+ err = nft_delrule(&ctx, old_rule);
+ if (err < 0)
+ goto err_destroy_flow_rule;
+
trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
if (trans == NULL) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- err = nft_delrule(&ctx, old_rule);
- if (err < 0) {
- nft_trans_destroy(trans);
- goto err2;
- }
-
list_add_tail_rcu(&rule->list, &old_rule->list);
} else {
- if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) {
+ trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
+ if (!trans) {
err = -ENOMEM;
- goto err2;
+ goto err_destroy_flow_rule;
}
- if (nlh->nlmsg_flags & NLM_F_APPEND) {
+ if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
if (old_rule)
list_add_rcu(&rule->list, &old_rule->list);
else
@@ -2733,54 +4416,68 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_rcu(&rule->list, &chain->rules);
}
}
- kvfree(info);
- chain->use++;
+ kvfree(expr_info);
- if (net->nft.validate_state == NFT_VALIDATE_DO)
+ if (flow)
+ nft_trans_flow_rule(trans) = flow;
+
+ if (table->validate_state == NFT_VALIDATE_DO)
return nft_table_validate(net, table);
return 0;
-err2:
- nf_tables_rule_release(&ctx, rule);
-err1:
+
+err_destroy_flow_rule:
+ nft_use_dec_restore(&chain->use);
+ if (flow)
+ nft_flow_rule_destroy(flow);
+err_release_rule:
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
+ nf_tables_rule_destroy(&ctx, rule);
+err_release_expr:
for (i = 0; i < n; i++) {
- if (info[i].ops != NULL)
- module_put(info[i].ops->type->owner);
+ if (expr_info[i].ops) {
+ module_put(expr_info[i].ops->type->owner);
+ if (expr_info[i].ops->type->release_ops)
+ expr_info[i].ops->type->release_ops(expr_info[i].ops);
+ }
}
- kvfree(info);
+ kvfree(expr_info);
+
return err;
}
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
+ const struct nft_chain *chain,
const struct nlattr *nla)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
struct nft_trans *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- struct nft_rule *rule = nft_trans_rule(trans);
-
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
if (trans->msg_type == NFT_MSG_NEWRULE &&
+ nft_trans_rule_chain(trans) == chain &&
id == nft_trans_rule_id(trans))
- return rule;
+ return nft_trans_rule(trans);
}
return ERR_PTR(-ENOENT);
}
-static int nf_tables_delrule(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- struct nft_table *table;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_chain *chain = NULL;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_rule *rule;
- int family = nfmsg->nfgen_family, err = 0;
struct nft_ctx ctx;
+ int err = 0;
- table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
return PTR_ERR(table);
@@ -2790,24 +4487,34 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
genmask);
if (IS_ERR(chain)) {
+ if (PTR_ERR(chain) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
}
+ if (nft_chain_binding(chain))
+ return -EOPNOTSUPP;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
- rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
+ rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]);
if (IS_ERR(rule)) {
+ if (PTR_ERR(rule) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
return PTR_ERR(rule);
}
err = nft_delrule(&ctx, rule);
} else if (nla[NFTA_RULE_ID]) {
- rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]);
+ rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
if (IS_ERR(rule)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
return PTR_ERR(rule);
@@ -2821,6 +4528,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_active_next(net, chain))
continue;
+ if (nft_chain_binding(chain))
+ continue;
ctx.chain = chain;
err = nft_delrule_by_chain(&ctx);
@@ -2835,25 +4544,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
/*
* Sets
*/
-
-static LIST_HEAD(nf_tables_set_types);
-
-int nft_register_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail_rcu(&type->list, &nf_tables_set_types);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_set);
-
-void nft_unregister_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_del_rcu(&type->list);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_set);
+static const struct nft_set_type *nft_set_types[] = {
+ &nft_set_hash_fast_type,
+ &nft_set_hash_type,
+ &nft_set_rhash_type,
+ &nft_set_bitmap_type,
+ &nft_set_rbtree_type,
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ &nft_set_pipapo_avx2_type,
+#endif
+ &nft_set_pipapo_type,
+};
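
With the old register/unregister API removed, backend selection is now a plain walk over this constant array, keeping whichever candidate's estimate wins under the requested policy. A condensed model of that selection loop (cost numbers and the tie-break are simplified stand-ins for ops->estimate() and NFT_SET_POL_PERFORMANCE):

#include <stddef.h>

struct est { unsigned int lookup, space; };
struct set_type { const char *name; struct est est; };

static const struct set_type types[] = {
	{ "hash",   { .lookup = 1, .space = 3 } },
	{ "bitmap", { .lookup = 1, .space = 1 } },
	{ "rbtree", { .lookup = 3, .space = 2 } },
};

/* cheapest lookup first, memory as tie-break: a miniature of the
 * performance-policy branch in nft_select_set_ops() */
static const struct set_type *select_type(void)
{
	const struct set_type *best = NULL;
	size_t i;

	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
		const struct set_type *t = &types[i];

		if (!best || t->est.lookup < best->est.lookup ||
		    (t->est.lookup == best->est.lookup &&
		     t->est.space < best->est.space))
			best = t;
	}
	return best;
}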
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
@@ -2870,34 +4571,25 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
* given, in that case the amount of memory per element is used.
*/
static const struct nft_set_ops *
-nft_select_set_ops(const struct nft_ctx *ctx,
- const struct nlattr * const nla[],
- const struct nft_set_desc *desc,
- enum nft_set_policies policy)
+nft_select_set_ops(const struct nft_ctx *ctx, u32 flags,
+ const struct nft_set_desc *desc)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
const struct nft_set_ops *ops, *bops;
struct nft_set_estimate est, best;
const struct nft_set_type *type;
- u32 flags = 0;
+ int i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
-#ifdef CONFIG_MODULES
- if (list_empty(&nf_tables_set_types)) {
- nft_request_module(ctx->net, "nft-set");
- if (!list_empty(&nf_tables_set_types))
- return ERR_PTR(-EAGAIN);
- }
-#endif
- if (nla[NFTA_SET_FLAGS] != NULL)
- flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
bops = NULL;
best.size = ~0;
best.lookup = ~0;
best.space = ~0;
- list_for_each_entry(type, &nf_tables_set_types, list) {
+ for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
+ type = nft_set_types[i];
ops = &type->ops;
if (!nft_set_ops_candidate(type, flags))
@@ -2905,7 +4597,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
if (!ops->estimate(desc, flags, &est))
continue;
- switch (policy) {
+ switch (desc->policy) {
case NFT_SET_POL_PERFORMANCE:
if (est.lookup < best.lookup)
break;
@@ -2928,11 +4620,6 @@ nft_select_set_ops(const struct nft_ctx *ctx,
break;
}
- if (!try_module_get(type->owner))
- continue;
- if (bops != NULL)
- module_put(to_set_type(bops)->owner);
-
bops = ops;
best = est;
}
@@ -2962,37 +4649,23 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
+ [NFTA_SET_EXPR] = { .type = NLA_NESTED },
+ [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
+ [NFTA_SET_TYPE] = { .type = NLA_REJECT },
+ [NFTA_SET_COUNT] = { .type = NLA_REJECT },
+};
+
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+ [NFTA_SET_FIELD_LEN] = { .type = NLA_U32 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
+ [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
-{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table = NULL;
-
- if (nla[NFTA_SET_TABLE] != NULL) {
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
- return PTR_ERR(table);
- }
- }
-
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
- return 0;
-}
-
-static struct nft_set *nft_set_lookup(const struct nft_table *table,
+static struct nft_set *nft_set_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_set *set;
@@ -3000,7 +4673,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(set, &table->sets, list) {
+ list_for_each_entry_rcu(set, &table->sets, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, set->name) &&
nft_active_genmask(set, genmask))
return set;
@@ -3023,19 +4697,21 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
}
static struct nft_set *nft_set_lookup_byid(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
- struct nft_trans *trans;
+ struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
+ struct nft_trans_set *trans;
- list_for_each_entry(trans, &net->nft.commit_list, list) {
- if (trans->msg_type == NFT_MSG_NEWSET) {
- struct nft_set *set = nft_trans_set(trans);
+	/* it is likely the id we need is at the tail, not at the start */
+ list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) {
+ struct nft_set *set = trans->set;
- if (id == nft_trans_set_id(trans) &&
- nft_active_genmask(set, genmask))
- return set;
- }
+ if (id == trans->set_id &&
+ set->table == table &&
+ nft_active_genmask(set, genmask))
+ return set;
}
return ERR_PTR(-ENOENT);
}
@@ -3048,12 +4724,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
{
struct nft_set *set;
- set = nft_set_lookup(table, nla_set_name, genmask);
+ set = nft_set_lookup(net, table, nla_set_name, genmask);
if (IS_ERR(set)) {
if (!nla_set_id)
return set;
- set = nft_set_lookup_byid(net, nla_set_id, genmask);
+ set = nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
return set;
}
@@ -3072,6 +4748,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
+ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
+ return -EINVAL;
+
inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
if (inuse == NULL)
return -ENOMEM;
@@ -3079,7 +4758,7 @@ cont:
list_for_each_entry(i, &ctx->table->sets, list) {
int tmp;
- if (!nft_is_active_next(ctx->net, set))
+ if (!nft_is_active_next(ctx->net, i))
continue;
if (!sscanf(i->name, name, &tmp))
continue;
@@ -3098,7 +4777,7 @@ cont:
free_page((unsigned long)inuse);
}
- set->name = kasprintf(GFP_KERNEL, name, min + n);
+ set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
if (!set->name)
return -ENOMEM;
@@ -3107,13 +4786,14 @@ cont:
continue;
if (!strcmp(set->name, i->name)) {
kfree(set->name);
+ set->name = NULL;
return -ENFILE;
}
}
return 0;
}
-static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
+int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
u64 ms = be64_to_cpu(nla_get_be64(nla));
u64 max = (u64)(~((u64)0));
@@ -3123,37 +4803,89 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
return -ERANGE;
ms *= NSEC_PER_MSEC;
- *result = nsecs_to_jiffies64(ms);
+ *result = nsecs_to_jiffies64(ms) ? : !!ms;
return 0;
}
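
The `?:` fallback added above keeps a small but nonzero timeout from truncating to zero jiffies, which would read as "no timeout": with a 100 Hz tick, for example, a 3 ms request used to convert to 0 and now clamps to 1 jiffy. A standalone illustration of the arithmetic (fixed HZ assumed; uses the same GNU `?:` shorthand as the kernel):

#include <stdio.h>

#define HZ 100ULL	/* example tick rate: 1 jiffy = 10 ms */

static unsigned long long msecs_to_jiffies64(unsigned long long ms)
{
	unsigned long long j = ms * HZ / 1000;	/* truncating division */

	return j ? : !!ms;	/* clamp nonzero timeouts to >= 1 jiffy */
}

int main(void)
{
	printf("%llu\n", msecs_to_jiffies64(0));	/* 0: no timeout */
	printf("%llu\n", msecs_to_jiffies64(3));	/* 1, not 0      */
	printf("%llu\n", msecs_to_jiffies64(25));	/* 2             */
	return 0;
}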
-static __be64 nf_jiffies64_to_msecs(u64 input)
+__be64 nf_jiffies64_to_msecs(u64 input)
{
- u64 ms = jiffies64_to_nsecs(input);
+ return cpu_to_be64(jiffies64_to_msecs(input));
+}
- return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+static int nf_tables_fill_set_concat(struct sk_buff *skb,
+ const struct nft_set *set)
+{
+ struct nlattr *concat, *field;
+ int i;
+
+ concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT);
+ if (!concat)
+ return -ENOMEM;
+
+ for (i = 0; i < set->field_count; i++) {
+ field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
+ if (!field)
+ return -ENOMEM;
+
+ if (nla_put_be32(skb, NFTA_SET_FIELD_LEN,
+ htonl(set->field_len[i])))
+ return -ENOMEM;
+
+ nla_nest_end(skb, field);
+ }
+
+ nla_nest_end(skb, concat);
+
+ return 0;
+}
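nf_tables_fill_set_concat() dumps the field layout of a concatenated set as one NFTA_SET_DESC_CONCAT nest holding an NFTA_LIST_ELEM nest per field, each carrying a big-endian NFTA_SET_FIELD_LEN. A sketch of how userspace could mirror this nesting with libmnl, assuming nlh already carries the nftables/nfgenmsg header:

#include <stdint.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nf_tables.h>

static void put_set_desc_concat(struct nlmsghdr *nlh,
				const uint32_t *field_len, int count)
{
	struct nlattr *concat, *field;
	int i;

	concat = mnl_attr_nest_start(nlh, NFTA_SET_DESC_CONCAT);
	for (i = 0; i < count; i++) {
		field = mnl_attr_nest_start(nlh, NFTA_LIST_ELEM);
		/* field lengths travel big-endian, as in the kernel dump */
		mnl_attr_put_u32(nlh, NFTA_SET_FIELD_LEN, htonl(field_len[i]));
		mnl_attr_nest_end(nlh, field);
	}
	mnl_attr_nest_end(nlh, concat);
}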
+
+static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
+{
+ if (ops->usize)
+ return ops->usize(size);
+
+ return size;
+}
+
+static noinline_for_stack int
+nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set)
+{
+ unsigned int nelems;
+ char str[40];
+ int ret;
+
+ ret = snprintf(str, sizeof(str), "%ps", set->ops);
+
+ /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped
+ * to userspace purely for informational/debug purposes.
+ */
+ DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str));
+
+ if (nla_put_string(skb, NFTA_SET_TYPE, str))
+ return -EMSGSIZE;
+
+ nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems));
+ return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems));
}
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
const struct nft_set *set, u16 event, u16 flags)
{
- struct nfgenmsg *nfmsg;
- struct nlmsghdr *nlh;
- struct nlattr *desc;
+ u64 timeout = READ_ONCE(set->timeout);
+ u32 gc_int = READ_ONCE(set->gc_int);
u32 portid = ctx->portid;
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
u32 seq = ctx->seq;
+ int i;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, ctx->family, NFNETLINK_V0,
+ nft_base_seq_be16(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
@@ -3161,6 +4893,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
NFTA_SET_PAD))
goto nla_put_failure;
+
+ if (event == NFT_MSG_DELSET ||
+ event == NFT_MSG_DESTROYSET) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (set->flags != 0)
if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
goto nla_put_failure;
@@ -3179,13 +4918,13 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
goto nla_put_failure;
- if (set->timeout &&
+ if (timeout &&
nla_put_be64(skb, NFTA_SET_TIMEOUT,
- nf_jiffies64_to_msecs(set->timeout),
+ nf_jiffies64_to_msecs(timeout),
NFTA_SET_PAD))
goto nla_put_failure;
- if (set->gc_int &&
- nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int)))
+ if (gc_int &&
+ nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int)))
goto nla_put_failure;
if (set->policy != NFT_SET_POL_PERFORMANCE) {
@@ -3193,16 +4932,45 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
}
- if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
+ if (set->udata &&
+ nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
goto nla_put_failure;
- desc = nla_nest_start(skb, NFTA_SET_DESC);
- if (desc == NULL)
+ nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+ if (!nest)
goto nla_put_failure;
if (set->size &&
- nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE,
+ htonl(nft_set_userspace_size(set->ops, set->size))))
+ goto nla_put_failure;
+
+ if (set->field_count > 1 &&
+ nf_tables_fill_set_concat(skb, set))
goto nla_put_failure;
- nla_nest_end(skb, desc);
+
+ nla_nest_end(skb, nest);
+
+ if (nf_tables_fill_set_info(skb, set))
+ goto nla_put_failure;
+
+ if (set->num_exprs == 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
+ if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ } else if (set->num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ set->exprs[i], false) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -3216,8 +4984,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
const struct nft_set *set, int event,
gfp_t gfp_flags)
{
- struct sk_buff *skb;
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
u32 portid = ctx->portid;
+ struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report &&
@@ -3228,14 +4998,16 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
- err = nf_tables_fill_set(skb, ctx, set, event, 0);
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
+ err = nf_tables_fill_set(skb, ctx, set, event, flags);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report,
- gfp_flags);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -3248,14 +5020,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
struct net *net = sock_net(skb->sk);
struct nft_ctx *ctx = cb->data, ctx_set;
+ struct nftables_pernet *nft_net;
if (cb->args[1])
return skb->len;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
ctx->family != table->family)
continue;
@@ -3319,25 +5093,31 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct nft_table *table = NULL;
+ struct net *net = info->net;
const struct nft_set *set;
- struct nft_ctx ctx;
struct sk_buff *skb2;
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_ctx ctx;
int err;
- /* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ if (nla[NFTA_SET_TABLE]) {
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
+ }
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_sets_start,
.dump = nf_tables_dump_sets,
@@ -3346,18 +5126,20 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
.module = THIS_MODULE,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
/* Only accept unspec with dump */
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
if (!nla[NFTA_SET_TABLE])
return -EINVAL;
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb2 == NULL)
@@ -3365,53 +5147,200 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
err = nf_tables_fill_set(skb2, &ctx, set, NFT_MSG_NEWSET, 0);
if (err < 0)
- goto err;
+ goto err_fill_set_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
-err:
+err_fill_set_info:
kfree_skb(skb2);
return err;
}
-static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
- struct nft_set_desc *desc,
+static int nft_set_desc_concat_parse(const struct nlattr *attr,
+ struct nft_set_desc *desc)
+{
+ struct nlattr *tb[NFTA_SET_FIELD_MAX + 1];
+ u32 len;
+ int err;
+
+ if (desc->field_count >= ARRAY_SIZE(desc->field_len))
+ return -E2BIG;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
+ nft_concat_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_SET_FIELD_LEN])
+ return -EINVAL;
+
+ len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
+ if (!len || len > U8_MAX)
+ return -EINVAL;
+
+ desc->field_len[desc->field_count++] = len;
+
+ return 0;
+}
+
+static int nft_set_desc_concat(struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ u32 len = 0, num_regs;
+ struct nlattr *attr;
+ int rem, err, i;
+
+ nla_for_each_nested(attr, nla, rem) {
+ if (nla_type(attr) != NFTA_LIST_ELEM)
+ return -EINVAL;
+
+ err = nft_set_desc_concat_parse(attr, desc);
+ if (err < 0)
+ return err;
+ }
+
+ for (i = 0; i < desc->field_count; i++)
+ len += round_up(desc->field_len[i], sizeof(u32));
+
+ if (len != desc->klen)
+ return -EINVAL;
+
+ num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
+ if (num_regs > NFT_REG32_COUNT)
+ return -E2BIG;
+
+ return 0;
+}
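nft_set_desc_concat() enforces three properties: each field pads up to a 32-bit register boundary, the padded fields must cover the key length exactly, and the total register count is capped. For example, an IPv4 address (4 bytes) concatenated with a port (2 bytes) pads to 4 + 4 = 8 bytes, i.e. two registers. A standalone sketch of the same arithmetic (NFT_REG32_COUNT assumed to be 16, matching the kernel constant):

#include <stdint.h>
#include <stdio.h>

#define NFT_REG32_COUNT	16	/* assumption: mirrors the kernel value */

static uint32_t round_up_u32(uint32_t v, uint32_t to)
{
	return (v + to - 1) / to * to;
}

/* Returns 0 if the field layout is a valid concatenation for klen. */
static int check_concat(const uint8_t *field_len, int count, uint32_t klen)
{
	uint32_t len = 0, num_regs;
	int i;

	for (i = 0; i < count; i++)
		len += round_up_u32(field_len[i], sizeof(uint32_t));

	if (len != klen)
		return -1;	/* padded fields must cover the key exactly */

	num_regs = (klen + sizeof(uint32_t) - 1) / sizeof(uint32_t);
	return num_regs > NFT_REG32_COUNT ? -1 : 0;
}

int main(void)
{
	const uint8_t ip_port[] = { 4, 2 };	/* IPv4 addr + TCP port */

	/* 4 -> 4, 2 -> 4: klen must be 8 (two registers). */
	printf("%d\n", check_concat(ip_port, 2, 8));	/* prints 0 */
	return 0;
}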
+
+static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
const struct nlattr *nla)
{
struct nlattr *da[NFTA_SET_DESC_MAX + 1];
int err;
- err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla,
- nft_set_desc_policy, NULL);
+ err = nla_parse_nested_deprecated(da, NFTA_SET_DESC_MAX, nla,
+ nft_set_desc_policy, NULL);
if (err < 0)
return err;
if (da[NFTA_SET_DESC_SIZE] != NULL)
desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+ if (da[NFTA_SET_DESC_CONCAT])
+ err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]);
+
+ return err;
+}
+
+static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set,
+ const struct nlattr * const *nla,
+ struct nft_expr **exprs, int *num_exprs,
+ u32 flags)
+{
+ struct nft_expr *expr;
+ int err, i;
+
+ if (nla[NFTA_SET_EXPR]) {
+ expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ exprs[0] = expr;
+ (*num_exprs)++;
+ } else if (nla[NFTA_SET_EXPRESSIONS]) {
+ struct nlattr *tmp;
+ int left;
+
+ if (!(flags & NFT_SET_EXPR)) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_set_expr_alloc;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_expr_alloc;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_expr_alloc;
+ }
+ exprs[i++] = expr;
+ (*num_exprs)++;
+ }
+ }
return 0;
+
+err_set_expr_alloc:
+ for (i = 0; i < *num_exprs; i++)
+ nft_expr_destroy(ctx, exprs[i]);
+
+ return err;
}
-static int nf_tables_newset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static bool nft_set_is_same(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ struct nft_expr *exprs[], u32 num_exprs, u32 flags)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ int i;
+
+ if (set->ktype != desc->ktype ||
+ set->dtype != desc->dtype ||
+ set->flags != flags ||
+ set->klen != desc->klen ||
+ set->dlen != desc->dlen ||
+ set->field_count != desc->field_count ||
+ set->num_exprs != num_exprs)
+ return false;
+
+ for (i = 0; i < desc->field_count; i++) {
+ if (set->field_len[i] != desc->field_len[i])
+ return false;
+ }
+
+ for (i = 0; i < num_exprs; i++) {
+ if (set->exprs[i]->ops != exprs[i]->ops)
+ return false;
+ }
+
+ return true;
+}
+
+static u32 nft_set_kernel_size(const struct nft_set_ops *ops,
+ const struct nft_set_desc *desc)
+{
+ if (ops->ksize)
+ return ops->ksize(desc->size);
+
+ return desc->size;
+}
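nft_set_userspace_size() above and nft_set_kernel_size() here are a pair: a backend may over-allocate internally (ksize) and must then report sizes back to userspace in the units the user configured (usize); both callbacks are optional, with identity as the fallback. A tiny illustration under the assumption of a hypothetical backend that keeps four slots of slack:

#include <stdint.h>

#define SLACK	4	/* hypothetical internal over-allocation */

static uint32_t demo_ksize(uint32_t user_size)
{
	return user_size + SLACK;	/* what the backend really stores */
}

static uint32_t demo_usize(uint32_t kernel_size)
{
	return kernel_size - SLACK;	/* what the user asked for */
}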
+
+static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_set_ops *ops;
+ struct net *net = info->net;
+ struct nft_set_desc desc;
struct nft_table *table;
+ unsigned char *udata;
struct nft_set *set;
struct nft_ctx ctx;
+ size_t alloc_size;
+ int num_exprs = 0;
char *name;
- u64 size;
- u64 timeout;
- u32 ktype, dtype, flags, policy, gc_int, objtype;
- struct nft_set_desc desc;
- unsigned char *udata;
+ int err, i;
u16 udlen;
- int err;
+ u32 flags;
+ u64 size;
if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_NAME] == NULL ||
@@ -3421,10 +5350,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
memset(&desc, 0, sizeof(desc));
- ktype = NFT_DATA_VALUE;
+ desc.ktype = NFT_DATA_VALUE;
if (nla[NFTA_SET_KEY_TYPE] != NULL) {
- ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
- if ((ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
+ desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
+ if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
return -EINVAL;
}
@@ -3438,25 +5367,34 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
NFT_SET_MAP | NFT_SET_EVAL |
- NFT_SET_OBJECT))
- return -EINVAL;
+ NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
+ return -EOPNOTSUPP;
/* Only one of these operations is supported */
- if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
- (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
+ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
+ (NFT_SET_MAP | NFT_SET_OBJECT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+ (NFT_SET_EVAL | NFT_SET_OBJECT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) ==
+ (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT))
+ return -EOPNOTSUPP;
+ if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) ==
+ (NFT_SET_CONSTANT | NFT_SET_TIMEOUT))
return -EOPNOTSUPP;
}
- dtype = 0;
+ desc.dtype = 0;
if (nla[NFTA_SET_DATA_TYPE] != NULL) {
if (!(flags & NFT_SET_MAP))
return -EINVAL;
- dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
- if ((dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
- dtype != NFT_DATA_VERDICT)
+ desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
+ if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
+ desc.dtype != NFT_DATA_VERDICT)
return -EINVAL;
- if (dtype != NFT_DATA_VERDICT) {
+ if (desc.dtype != NFT_DATA_VERDICT) {
if (nla[NFTA_SET_DATA_LEN] == NULL)
return -EINVAL;
desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
@@ -3471,73 +5409,128 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
if (!(flags & NFT_SET_OBJECT))
return -EINVAL;
- objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
- if (objtype == NFT_OBJECT_UNSPEC ||
- objtype > NFT_OBJECT_MAX)
- return -EINVAL;
+ desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+ if (desc.objtype == NFT_OBJECT_UNSPEC ||
+ desc.objtype > NFT_OBJECT_MAX)
+ return -EOPNOTSUPP;
} else if (flags & NFT_SET_OBJECT)
return -EINVAL;
else
- objtype = NFT_OBJECT_UNSPEC;
+ desc.objtype = NFT_OBJECT_UNSPEC;
- timeout = 0;
+ desc.timeout = 0;
if (nla[NFTA_SET_TIMEOUT] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout);
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout);
if (err)
return err;
}
- gc_int = 0;
+ desc.gc_int = 0;
if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
if (!(flags & NFT_SET_TIMEOUT))
return -EINVAL;
- gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
+
+ if (flags & NFT_SET_ANONYMOUS)
+ return -EOPNOTSUPP;
+
+ desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
}
- policy = NFT_SET_POL_PERFORMANCE;
- if (nla[NFTA_SET_POLICY] != NULL)
- policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ desc.policy = NFT_SET_POL_PERFORMANCE;
+ if (nla[NFTA_SET_POLICY] != NULL) {
+ desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ switch (desc.policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ case NFT_SET_POL_MEMORY:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
if (nla[NFTA_SET_DESC] != NULL) {
- err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+ err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
if (err < 0)
return err;
+
+ if (desc.field_count > 1) {
+ if (!(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
+ }
+ } else if (flags & NFT_SET_CONCAT) {
+ return -EINVAL;
}
- table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
+ if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
+ desc.expr = true;
+
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
return PTR_ERR(table);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask);
+ set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set)) {
if (PTR_ERR(set) != -ENOENT) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return PTR_ERR(set);
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {};
+
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
return -EEXIST;
}
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return 0;
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
+ err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
+ if (err < 0)
+ return err;
+
+ if (desc.size)
+ desc.size = nft_set_kernel_size(set->ops, &desc);
+
+ err = 0;
+ if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
+ err = -EEXIST;
+ }
+
+ for (i = 0; i < num_exprs; i++)
+ nft_expr_destroy(&ctx, exprs[i]);
+
+ if (err < 0)
+ return err;
+
+ return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc);
}
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
- ops = nft_select_set_ops(&ctx, nla, &desc, policy);
+ ops = nft_select_set_ops(&ctx, flags, &desc);
if (IS_ERR(ops))
return PTR_ERR(ops);
+ if (desc.size)
+ desc.size = nft_set_kernel_size(ops, &desc);
+
udlen = 0;
if (nla[NFTA_SET_USERDATA])
udlen = nla_len(nla[NFTA_SET_USERDATA]);
@@ -3545,23 +5538,29 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
size = 0;
if (ops->privsize != NULL)
size = ops->privsize(nla, &desc);
+ alloc_size = sizeof(*set) + size + udlen;
+ if (alloc_size < size || alloc_size > INT_MAX)
+ return -ENOMEM;
+
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
- set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
+ set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!set) {
err = -ENOMEM;
- goto err1;
+ goto err_alloc;
}
- name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+ name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
if (!name) {
err = -ENOMEM;
- goto err2;
+ goto err_set_name;
}
err = nf_tables_set_alloc_name(&ctx, set, name);
kfree(name);
if (err < 0)
- goto err2;
+ goto err_set_name;
udata = NULL;
if (udlen) {
@@ -3570,103 +5569,159 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
+ INIT_LIST_HEAD(&set->catchall_list);
+ refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
- set->ops = ops;
- set->ktype = ktype;
- set->klen = desc.klen;
- set->dtype = dtype;
- set->objtype = objtype;
- set->dlen = desc.dlen;
+ set->ops = ops;
+ set->ktype = desc.ktype;
+ set->klen = desc.klen;
+ set->dtype = desc.dtype;
+ set->objtype = desc.objtype;
+ set->dlen = desc.dlen;
set->flags = flags;
- set->size = desc.size;
- set->policy = policy;
- set->udlen = udlen;
- set->udata = udata;
- set->timeout = timeout;
- set->gc_int = gc_int;
- set->handle = nf_tables_alloc_handle(table);
+ set->size = desc.size;
+ set->policy = desc.policy;
+ set->udlen = udlen;
+ set->udata = udata;
+ set->timeout = desc.timeout;
+ set->gc_int = desc.gc_int;
+
+ set->field_count = desc.field_count;
+ for (i = 0; i < desc.field_count; i++)
+ set->field_len[i] = desc.field_len[i];
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err3;
+ goto err_set_init;
+
+ err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags);
+ if (err < 0)
+ goto err_set_destroy;
+
+ set->num_exprs = num_exprs;
+ set->handle = nf_tables_alloc_handle(table);
+ INIT_LIST_HEAD(&set->pending_update);
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err4;
+ goto err_set_expr_alloc;
list_add_tail_rcu(&set->list, &table->sets);
- table->use++;
+
return 0;
-err4:
- ops->destroy(set);
-err3:
+err_set_expr_alloc:
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(&ctx, set->exprs[i]);
+err_set_destroy:
+ ops->destroy(&ctx, set);
+err_set_init:
kfree(set->name);
-err2:
+err_set_name:
kvfree(set);
-err1:
- module_put(to_set_type(ops)->owner);
+err_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
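One hardening detail in the function above: the private size returned by ops->privsize() is influenced by netlink attributes, so alloc_size = sizeof(*set) + size + udlen is checked both for wraparound (alloc_size < size) and for exceeding INT_MAX before the kvzalloc() call. A standalone mirror of that check:

#include <limits.h>
#include <stddef.h>

/* Overflow-safe combined allocation size; mirrors the kernel's check.
 * The wrap test against priv is sufficient here because the other two
 * terms are small and bounded. Returns 0 on success, -1 on overflow. */
static int safe_alloc_size(size_t base, size_t priv, size_t udlen,
			   size_t *out)
{
	size_t total = base + priv + udlen;

	if (total < priv || total > INT_MAX)
		return -1;

	*out = total;
	return 0;
}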
-static void nft_set_destroy(struct nft_set *set)
+static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
+ struct nft_set *set)
{
- set->ops->destroy(set);
- module_put(to_set_type(set->ops)->owner);
- kfree(set->name);
- kvfree(set);
+ struct nft_set_elem_catchall *next, *catchall;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ list_del_rcu(&catchall->list);
+ nf_tables_set_elem_destroy(ctx, set, catchall->elem);
+ kfree_rcu(catchall, rcu);
+ }
}
-static int nf_tables_delset(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static void nft_set_put(struct nft_set *set)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
+ if (refcount_dec_and_test(&set->refs)) {
+ kfree(set->name);
+ kvfree(set);
+ }
+}
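nft_set_put() introduces plain reference counting so the abort and commit paths can keep a set's memory alive past its removal from the lists; the name and the set itself go away only on the final put. The same pattern in standalone C11 (the kernel's refcount_t additionally saturates instead of wrapping):

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refs;
	char *name;
};

/* Free the object only when the last reference is dropped. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub_explicit(&o->refs, 1,
				      memory_order_acq_rel) == 1) {
		free(o->name);
		free(o);
	}
}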
+
+static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ int i;
+
+ if (WARN_ON(set->use > 0))
+ return;
+
+ for (i = 0; i < set->num_exprs; i++)
+ nft_expr_destroy(ctx, set->exprs[i]);
+
+ set->ops->destroy(ctx, set);
+ nft_set_catchall_destroy(ctx, set);
+ nft_set_put(set);
+}
+
+static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
- int err;
- if (nfmsg->nfgen_family == NFPROTO_UNSPEC)
+ if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
return -EAFNOSUPPORT;
- if (nla[NFTA_SET_TABLE] == NULL)
- return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
+ return PTR_ERR(table);
+ }
if (nla[NFTA_SET_HANDLE]) {
attr = nla[NFTA_SET_HANDLE];
- set = nft_set_lookup_byhandle(ctx.table, attr, genmask);
+ set = nft_set_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_SET_NAME];
- set = nft_set_lookup(ctx.table, attr, genmask);
+ set = nft_set_lookup(net, table, attr, genmask);
}
if (IS_ERR(set)) {
+ if (PTR_ERR(set) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(set);
}
- if (!list_empty(&set->bindings) ||
- (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) {
+ if (set->use ||
+ (info->nlh->nlmsg_flags & NLM_F_NONREC &&
+ atomic_read(&set->nelems) > 0)) {
NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
}
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
return nft_delset(&ctx, set);
}
-static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len);
+
+static int nft_setelem_data_validate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
enum nft_registers dreg;
dreg = nft_type_to_reg(set->dtype);
@@ -3676,11 +5731,50 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
set->dlen);
}
+static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ return nft_setelem_data_validate(ctx, set, elem_priv);
+}
+
+static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ ret = nft_setelem_data_validate(ctx, set, catchall->elem);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_binding *binding)
{
struct nft_set_binding *i;
- struct nft_set_iter iter;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nf_tables_bind_check_setelem,
+ };
if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
return -EBUSY;
@@ -3695,53 +5789,150 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
goto bind;
}
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_bind_check_setelem;
-
set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_bind_check(ctx, set);
+
if (iter.err < 0)
return iter.err;
}
bind:
+ if (!nft_use_inc(&set->use))
+ return -EMFILE;
+
binding->chain = ctx->chain;
list_add_tail_rcu(&binding->list, &set->bindings);
+ nft_set_trans_bind(ctx, set);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);
-void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding)
+static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding, bool event)
{
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set))
- list_add_tail_rcu(&set->list, &ctx->table->sets);
+ list_del_rcu(&binding->list);
- list_add_tail_rcu(&binding->list, &set->bindings);
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) {
+ list_del_rcu(&set->list);
+ set->dead = 1;
+ if (event)
+ nf_tables_set_notify(ctx, set, NFT_MSG_DELSET,
+ GFP_KERNEL);
+ }
}
-EXPORT_SYMBOL_GPL(nf_tables_rebind_set);
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_binding *binding)
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv);
+
+static int nft_mapelem_activate(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
{
- list_del_rcu(&binding->list);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set))
- list_del_rcu(&set->list);
+ /* called from abort path, reverse check to undo changes. */
+ if (nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ nft_clear(ctx->net, ext);
+ nft_setelem_data_activate(ctx->net, set, elem_priv);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
-void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
+static void nft_map_catchall_activate(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ nft_clear(ctx->net, ext);
+ nft_setelem_data_activate(ctx->net, set, catchall->elem);
+ break;
+ }
+}
+
+static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
{
- if (list_empty(&set->bindings) && nft_set_is_anonymous(set) &&
- nft_is_active(ctx->net, set)) {
- nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC);
- nft_set_destroy(set);
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_mapelem_activate,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ WARN_ON_ONCE(iter.err);
+
+ nft_map_catchall_activate(ctx, set);
+}
+
+void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ if (nft_set_is_anonymous(set)) {
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(ctx, set);
+
+ nft_clear(ctx->net, set);
+ }
+
+ nft_use_inc_restore(&set->use);
+}
+EXPORT_SYMBOL_GPL(nf_tables_activate_set);
+
+void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_binding *binding,
+ enum nft_trans_phase phase)
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net));
+
+ switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nft_set_trans_unbind(ctx, set);
+ if (nft_set_is_anonymous(set))
+ nft_deactivate_next(ctx->net, set);
+ else
+ list_del_rcu(&binding->list);
+
+ nft_use_dec(&set->use);
+ break;
+ case NFT_TRANS_PREPARE:
+ if (nft_set_is_anonymous(set)) {
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_deactivate_next(ctx->net, set);
+ }
+ nft_use_dec(&set->use);
+ return;
+ case NFT_TRANS_ABORT:
+ case NFT_TRANS_RELEASE:
+ if (nft_set_is_anonymous(set) &&
+ set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(ctx, set);
+
+ nft_use_dec(&set->use);
+ fallthrough;
+ default:
+ nf_tables_unbind_set(ctx, set, binding,
+ phase == NFT_TRANS_COMMIT);
}
}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
+
+void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
+{
+ if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
+ nft_set_destroy(ctx, set);
+}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
const struct nft_set_ext_type nft_set_ext_types[] = {
@@ -3751,8 +5942,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_DATA] = {
.align = __alignof__(u32),
},
- [NFT_SET_EXT_EXPR] = {
- .align = __alignof__(struct nft_expr),
+ [NFT_SET_EXT_EXPRESSIONS] = {
+ .align = __alignof__(struct nft_set_elem_expr),
},
[NFT_SET_EXT_OBJREF] = {
.len = sizeof(struct nft_object *),
@@ -3763,19 +5954,17 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u8),
},
[NFT_SET_EXT_TIMEOUT] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
- },
- [NFT_SET_EXT_EXPIRATION] = {
- .len = sizeof(u64),
- .align = __alignof__(u64),
+ .len = sizeof(struct nft_timeout),
+ .align = __alignof__(struct nft_timeout),
},
[NFT_SET_EXT_USERDATA] = {
.len = sizeof(struct nft_userdata),
.align = __alignof__(struct nft_userdata),
},
+ [NFT_SET_EXT_KEY_END] = {
+ .align = __alignof__(u32),
+ },
};
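Each entry of this table gives the fixed length and alignment of one element-extension type; a template sums them, aligning every offset, to size the per-element area. Note the timeout/expiration pair now lives in a single struct nft_timeout extension instead of two separate u64 slots. A sketch of the offset computation, with made-up len/align values and power-of-two alignment assumed:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the len/align table and template layout. */
struct ext_type {
	uint8_t len, align;
};

static uint32_t align_up(uint32_t off, uint32_t a)
{
	return (off + a - 1) & ~(a - 1);	/* a must be a power of two */
}

int main(void)
{
	/* e.g. a 16-byte key (u32-aligned), then a 16-byte timeout
	 * extension (u64-aligned) */
	const struct ext_type types[] = { { 16, 4 }, { 16, 8 } };
	uint32_t off = 0;
	unsigned int i;

	for (i = 0; i < 2; i++) {
		off = align_up(off, types[i].align);
		printf("ext %u at offset %u\n", i, off);
		off += types[i].len;
	}
	return 0;
}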
-EXPORT_SYMBOL_GPL(nft_set_ext_types);
/*
* Set elements
@@ -3786,10 +5975,14 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
[NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED },
[NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 },
+ [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 },
[NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED },
- [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING },
+ [NFTA_SET_ELEM_OBJREF] = { .type = NLA_STRING,
+ .len = NFT_OBJ_MAXNAMELEN - 1 },
+ [NFTA_SET_ELEM_KEY_END] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -3797,61 +5990,83 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_SET_ELEM_LIST_SET] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_SET_ELEM_LIST_ELEMENTS] = { .type = NLA_NESTED },
+ [NFTA_SET_ELEM_LIST_ELEMENTS] = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
- const struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack,
- u8 genmask)
+static int nft_set_elem_expr_dump(struct sk_buff *skb,
+ const struct nft_set *set,
+ const struct nft_set_ext *ext,
+ bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- int family = nfmsg->nfgen_family;
- struct nft_table *table;
+ struct nft_set_elem_expr *elem_expr;
+ u32 size, num_exprs = 0;
+ struct nft_expr *expr;
+ struct nlattr *nest;
- table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
- genmask);
- if (IS_ERR(table)) {
- NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
- return PTR_ERR(table);
- }
+ elem_expr = nft_set_ext_expr(ext);
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ num_exprs++;
+
+ if (num_exprs == 1) {
+ expr = nft_setelem_expr_at(elem_expr, 0);
+ if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0)
+ return -1;
+
+ return 0;
+ } else if (num_exprs > 1) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
+ if (nest == NULL)
+ goto nla_put_failure;
- nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ expr = nft_setelem_expr_at(elem_expr, size);
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
return 0;
+
+nla_put_failure:
+ return -1;
}
static int nf_tables_fill_setelem(struct sk_buff *skb,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
- nest = nla_nest_start(skb, NFTA_LIST_ELEM);
+ nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
if (nest == NULL)
goto nla_put_failure;
- if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
+ NFT_DATA_VALUE, set->klen) < 0)
+ goto nla_put_failure;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext),
NFT_DATA_VALUE, set->klen) < 0)
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
- set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
- set->dlen) < 0)
+ nft_set_datatype(set), set->dlen) < 0)
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR) &&
- nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_set_elem_expr_dump(skb, set, ext, reset))
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
- (*nft_set_ext_obj(ext))->name) < 0)
+ (*nft_set_ext_obj(ext))->key.name) < 0)
goto nla_put_failure;
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
@@ -3859,25 +6074,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
htonl(*nft_set_ext_flags(ext))))
goto nla_put_failure;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
- nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
- nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout);
+ u64 set_timeout = READ_ONCE(set->timeout);
+ __be64 msecs = 0;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- u64 expires, now = get_jiffies_64();
+ if (set_timeout != timeout) {
+ msecs = nf_jiffies64_to_msecs(timeout);
+ if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs,
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
- expires = *nft_set_ext_expiration(ext);
- if (time_before64(now, expires))
- expires -= now;
- else
- expires = 0;
+ if (timeout > 0) {
+ u64 expires, now = get_jiffies_64();
- if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
- nf_jiffies64_to_msecs(expires),
- NFTA_SET_ELEM_PAD))
- goto nla_put_failure;
+ expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration);
+ if (time_before64(now, expires))
+ expires -= now;
+ else
+ expires = 0;
+
+ if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
+ nf_jiffies64_to_msecs(expires),
+ NFTA_SET_ELEM_PAD))
+ goto nla_put_failure;
+ }
}
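Note that expirations are dumped as time remaining, clamped at zero, with time_before64() keeping the comparison safe across jiffies wraparound. The clamp itself, in isolation:

#include <stdint.h>

/* Absolute deadline -> "time left", never negative. now and deadline
 * are monotonic tick counts (the kernel compares jiffies64 values via
 * time_before64() to stay correct across counter wrap). */
static uint64_t time_left(uint64_t now, uint64_t deadline)
{
	return now < deadline ? deadline - now : 0;
}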
if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
@@ -3901,40 +6123,97 @@ struct nft_set_dump_args {
const struct netlink_callback *cb;
struct nft_set_iter iter;
struct sk_buff *skb;
+ bool reset;
};
static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_set_dump_args *args;
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext))
+ return 0;
+
args = container_of(iter, struct nft_set_dump_args, iter);
- return nf_tables_fill_setelem(args->skb, set, elem);
+ return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset);
+}
+
+static void audit_log_nft_set_reset(const struct nft_table *table,
+ unsigned int base_seq,
+ unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
+ kfree(buf);
}
struct nft_set_dump_ctx {
const struct nft_set *set;
struct nft_ctx ctx;
+ bool reset;
};
+static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
+ const struct nft_set *set, bool reset,
+ unsigned int base_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) ||
+ nft_set_elem_expired(ext))
+ continue;
+
+ ret = nf_tables_fill_setelem(skb, set, catchall->elem, reset);
+ if (reset && !ret)
+ audit_log_nft_set_reset(set->table, base_seq, 1);
+ break;
+ }
+
+ return ret;
+}
+
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
struct net *net = sock_net(skb->sk);
+ struct nftables_pernet *nft_net;
struct nft_table *table;
struct nft_set *set;
- struct nft_set_dump_args args;
+ struct nft_set_dump_args args = {
+ .cb = cb,
+ .skb = skb,
+ .reset = dump_ctx->reset,
+ .iter = {
+ .genmask = nft_genmask_cur(net),
+ .type = NFT_ITER_READ,
+ .skip = cb->args[0],
+ .fn = nf_tables_dump_setelem,
+ },
+ };
bool set_found = false;
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
u32 portid, seq;
int event;
rcu_read_lock();
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
+
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
dump_ctx->ctx.family != table->family)
continue;
@@ -3960,38 +6239,30 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- NLM_F_MULTI);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
+ table->family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = table->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
goto nla_put_failure;
- nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
if (nest == NULL)
goto nla_put_failure;
- args.cb = cb;
- args.skb = skb;
- args.iter.genmask = nft_genmask_cur(net);
- args.iter.skip = cb->args[0];
- args.iter.count = 0;
- args.iter.err = 0;
- args.iter.fn = nf_tables_dump_setelem;
set->ops->walk(&dump_ctx->ctx, set, &args.iter);
- rcu_read_unlock();
+ if (!args.iter.err && args.iter.count == cb->args[0])
+ args.iter.err = nft_set_catchall_dump(net, skb, set,
+ dump_ctx->reset, cb->seq);
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
+ rcu_read_unlock();
+
if (args.iter.err && args.iter.err != -EMSGSIZE)
return args.iter.err;
if (args.iter.count == cb->args[0])
@@ -4005,6 +6276,26 @@ nla_put_failure:
return -ENOSPC;
}
+static int nf_tables_dumpreset_set(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
+ int ret, skip = cb->args[0];
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ ret = nf_tables_dump_set(skb, cb);
+
+ if (cb->args[0] > skip)
+ audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
+ cb->args[0] - skip);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_set_start(struct netlink_callback *cb)
{
struct nft_set_dump_ctx *dump_ctx = cb->data;
@@ -4024,34 +6315,29 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
const struct nft_ctx *ctx, u32 seq,
u32 portid, int event, u16 flags,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ const struct nft_elem_priv *elem_priv,
+ bool reset)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
int err;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg),
- flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
+ NFNETLINK_V0, nft_base_seq_be16(ctx->net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
goto nla_put_failure;
- nest = nla_nest_start(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
+ nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
if (nest == NULL)
goto nla_put_failure;
- err = nf_tables_fill_setelem(skb, set, elem);
+ err = nf_tables_fill_setelem(skb, set, elem_priv, reset);
if (err < 0)
goto nla_put_failure;
@@ -4072,133 +6358,292 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
return 0;
*flags = ntohl(nla_get_be32(attr));
- if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
+ if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EOPNOTSUPP;
if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
+ if ((*flags & (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) ==
+ (NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
+
+ return 0;
+}
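The flag validation uses the usual bitmask idioms: flags & ~mask rejects unknown bits, and (flags & (A | B)) == (A | B) rejects exactly the combination of interval-end and catch-all while still allowing each alone. The same checks in isolation, with illustrative bit values rather than the real uapi constants:

#include <stdint.h>

#define ELEM_INTERVAL_END	(1u << 0)	/* illustrative values only */
#define ELEM_CATCHALL		(1u << 1)

/* Returns 0 if flags are acceptable, -1 for an invalid combination. */
static int check_elem_flags(uint32_t flags)
{
	if (flags & ~(ELEM_INTERVAL_END | ELEM_CATCHALL))
		return -1;	/* unknown bit */

	if ((flags & (ELEM_INTERVAL_END | ELEM_CATCHALL)) ==
	    (ELEM_INTERVAL_END | ELEM_CATCHALL))
		return -1;	/* mutually exclusive */

	return 0;
}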
+
+static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set,
+ struct nft_data *key, struct nlattr *attr)
+{
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = NFT_DATA_VALUE_MAXLEN,
+ .len = set->klen,
+ };
+
+ return nft_data_init(ctx, key, &desc, attr);
+}
+
+static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_data_desc *desc,
+ struct nft_data *data,
+ struct nlattr *attr)
+{
+ u32 dtype;
+
+ if (set->dtype == NFT_DATA_VERDICT)
+ dtype = NFT_DATA_VERDICT;
+ else
+ dtype = NFT_DATA_VALUE;
+
+ desc->type = dtype;
+ desc->size = NFT_DATA_VALUE_MAXLEN;
+ desc->len = set->dlen;
+ desc->flags = NFT_DATA_DESC_SETELEM;
+
+ return nft_data_init(ctx, data, desc, attr);
+}
+
+static void *nft_setelem_catchall_get(const struct net *net,
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+ void *priv = NULL;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask) ||
+ nft_set_elem_expired(ext))
+ continue;
+
+ priv = catchall->elem;
+ break;
+ }
+
+ return priv;
+}
+
+static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set,
+ struct nft_set_elem *elem, u32 flags)
+{
+ void *priv;
+
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+ priv = set->ops->get(ctx->net, set, elem, flags);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+ } else {
+ priv = nft_setelem_catchall_get(ctx->net, set);
+ if (!priv)
+ return -ENOENT;
+ }
+ elem->priv = priv;
return 0;
}
-static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr)
+static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, bool reset)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
- struct nft_data_desc desc;
struct nft_set_elem elem;
struct sk_buff *skb;
uint32_t flags = 0;
- void *priv;
int err;
- err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
- nft_set_elem_policy, NULL);
+ err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy, NULL);
if (err < 0)
return err;
- if (!nla[NFTA_SET_ELEM_KEY])
- return -EINVAL;
-
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- return err;
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
- err = -EINVAL;
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
- return err;
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
+ }
- priv = set->ops->get(ctx->net, set, &elem, flags);
- if (IS_ERR(priv))
- return PTR_ERR(priv);
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ return err;
+ }
- elem.priv = priv;
+ err = nft_setelem_get(ctx, set, &elem, flags);
+ if (err < 0)
+ return err;
err = -ENOMEM;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (skb == NULL)
- goto err1;
+ return err;
err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid,
- NFT_MSG_NEWSETELEM, 0, set, &elem);
+ NFT_MSG_NEWSETELEM, 0, set, elem.priv,
+ reset);
if (err < 0)
- goto err2;
+ goto err_fill_setelem;
- err = nfnetlink_unicast(skb, ctx->net, ctx->portid, MSG_DONTWAIT);
- /* This avoids a loop in nfnetlink. */
- if (err < 0)
- goto err1;
+ return nfnetlink_unicast(skb, ctx->net, ctx->portid);
- return 0;
-err2:
+err_fill_setelem:
kfree_skb(skb);
-err1:
- /* this avoids a loop in nfnetlink. */
- return err == -EAGAIN ? -ENOBUFS : err;
+ return err;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx,
+ const struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[],
+ bool reset)
{
- u8 genmask = nft_genmask_cur(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
+ struct nft_table *table;
struct nft_set *set;
- struct nlattr *attr;
- struct nft_ctx ctx;
- int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ nft_ctx_init(&dump_ctx->ctx, net, skb,
+ info->nlh, family, table, NULL, nla);
+ dump_ctx->set = set;
+ dump_ctx->reset = reset;
+ return 0;
+}
+
+/* called with rcu_read_lock held */
+static int nf_tables_getsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ struct nlattr *attr;
+ int rem, err = 0;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_set_start,
.dump = nf_tables_dump_set,
.done = nf_tables_dump_set_done,
.module = THIS_MODULE,
};
- struct nft_set_dump_ctx dump_ctx = {
- .set = set,
- .ctx = ctx,
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ c.data = &dump_ctx;
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+ return -EINVAL;
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
+ if (err)
+ return err;
+
+ nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int nf_tables_getsetelem_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_set_dump_ctx dump_ctx;
+ int rem, err = 0, nelems = 0;
+ struct nlattr *attr;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_set_start,
+ .dump = nf_tables_dumpreset_set,
+ .done = nf_tables_dump_set_done,
+ .module = THIS_MODULE,
};
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ return err;
+
c.data = &dump_ctx;
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
return -EINVAL;
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ rcu_read_lock();
+
+ err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
+ if (err)
+ goto out_unlock;
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_get_set_elem(&ctx, set, attr);
- if (err < 0)
+ err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
break;
+ }
+ nelems++;
}
+ audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);
+
+out_unlock:
+ rcu_read_unlock();
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
return err;
}
static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
const struct nft_set *set,
- const struct nft_set_elem *elem,
- int event, u16 flags)
+ const struct nft_elem_priv *elem_priv,
+ int event)
{
+ struct nftables_pernet *nft_net;
struct net *net = ctx->net;
u32 portid = ctx->portid;
struct sk_buff *skb;
+ u16 flags = 0;
int err;
if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
@@ -4208,150 +6653,634 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
- set, elem);
+ set, elem_priv, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report,
- GFP_KERNEL);
+ nft_net = nft_pernet(net);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
-static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
+static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx,
int msg_type,
struct nft_set *set)
{
+ struct nft_trans_elem *te;
struct nft_trans *trans;
- trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem));
+ trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1));
if (trans == NULL)
return NULL;
- nft_trans_elem_set(trans) = set;
+ te = nft_trans_container_elem(trans);
+ te->nelems = 1;
+ te->set = set;
+
return trans;
}
-void *nft_set_elem_init(const struct nft_set *set,
- const struct nft_set_ext_tmpl *tmpl,
- const u32 *key, const u32 *data,
- u64 timeout, gfp_t gfp)
+struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_expr_init(ctx, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ err = -EOPNOTSUPP;
+ if (expr->ops->type->flags & NFT_EXPR_GC) {
+ if (set->flags & NFT_SET_TIMEOUT)
+ goto err_set_elem_expr;
+ if (!set->ops->gc_init)
+ goto err_set_elem_expr;
+ set->ops->gc_init(set);
+ }
+
+ return expr;
+
+err_set_elem_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
+static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
+{
+ len += nft_set_ext_types[id].len;
+ if (len > tmpl->ext_len[id] ||
+ len > U8_MAX)
+ return -1;
+
+ return 0;
+}
+
+static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
+ void *to, const void *from, u32 len)
+{
+ if (nft_set_ext_check(tmpl, id, len) < 0)
+ return -1;
+
+ memcpy(to, from, len);
+
+ return 0;
+}
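nft_set_ext_check()/nft_set_ext_memcpy() make every copy into the element's extension area bounded by the length the template reserved for that extension id, so a key or data length that disagrees with the template can no longer overrun the allocation. A simplified standalone version of the bounded copy:

#include <stdint.h>
#include <string.h>

/* Copy src into a fixed extension slot only if it fits the length the
 * template reserved for it (simplified sketch; the kernel also adds
 * the extension type's own header length before comparing). */
static int ext_memcpy_bounded(void *dst, const void *src, uint32_t len,
			      uint32_t reserved)
{
	if (len > reserved || len > UINT8_MAX)
		return -1;

	memcpy(dst, src, len);
	return 0;
}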
+
+struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
+ const struct nft_set_ext_tmpl *tmpl,
+ const u32 *key, const u32 *key_end,
+ const u32 *data,
+ u64 timeout, u64 expiration, gfp_t gfp)
{
struct nft_set_ext *ext;
void *elem;
elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
if (elem == NULL)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ext = nft_set_elem_ext(set, elem);
nft_set_ext_init(ext, tmpl);
- memcpy(nft_set_ext_key(ext), key, set->klen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- memcpy(nft_set_ext_data(ext), data, set->dlen);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
- *nft_set_ext_expiration(ext) =
- get_jiffies_64() + timeout;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
- *nft_set_ext_timeout(ext) = timeout;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
+ nft_set_ext_key(ext), key, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
+ nft_set_ext_key_end(ext), key_end, set->klen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+ nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
+ nft_set_ext_data(ext), data, set->dlen) < 0)
+ goto err_ext_check;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
+ nft_set_ext_timeout(ext)->timeout = timeout;
+
+ if (expiration == 0)
+ expiration = timeout;
+
+ nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration;
+ }
return elem;
+
+err_ext_check:
+ kfree(elem);
+
+ return ERR_PTR(-EINVAL);
+}
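
A zero expiration falls back to the full timeout, and the stored expiry is absolute (current jiffies plus the relative value). A sketch of that rule in plain C, with CLOCK_MONOTONIC standing in for jiffies; elem_deadline_ms() is a hypothetical helper, not kernel API:

#include <stdint.h>
#include <time.h>

/* Relative timeout/expiration to an absolute deadline; expiration defaults
 * to the full timeout when the caller passed 0, as in nft_set_elem_init(). */
static uint64_t elem_deadline_ms(uint64_t timeout_ms, uint64_t expiration_ms)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (expiration_ms == 0)
                expiration_ms = timeout_ms;
        return (uint64_t)now.tv_sec * 1000 + now.tv_nsec / 1000000
               + expiration_ms;
}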
+
+static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(ctx, expr);
+ module_put(expr->ops->type->owner);
+ } else {
+ nf_tables_expr_destroy(ctx, expr);
+ }
}
-void nft_set_elem_destroy(const struct nft_set *set, void *elem,
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_set_elem_expr *elem_expr)
+{
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size)
+ __nft_set_elem_expr_destroy(ctx, expr);
+}
+
+/* Drop references and destroy. Called from gc, dynset and abort path. */
+static void __nft_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
+ bool destroy_expr)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+
+ kfree(elem_priv);
+}
+
+/* Drop references and destroy. Called from gc and dynset. */
+void nft_set_elem_destroy(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv,
bool destroy_expr)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
struct nft_ctx ctx = {
.net = read_pnet(&set->net),
.family = set->table->family,
};
- nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(ext);
+ __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr);
+}
+EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
- if (expr->ops->destroy_clone) {
- expr->ops->destroy_clone(&ctx, expr);
- module_put(expr->ops->type->owner);
- } else {
- nf_tables_expr_destroy(&ctx, expr);
- }
+/* Drop references and destroy. Called from abort path. */
+static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ /* skip update request, see nft_trans_elems_new_abort() */
+ if (!te->elems[i].priv)
+ continue;
+
+ __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true);
}
- if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
- kfree(elem);
}
-EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
-/* Only called from commit path, nft_set_elem_deactivate() already deals with
- * the refcounting from the preparation phase.
+/* Destroy element. References have already been dropped in the preparation
+ * path via nft_setelem_data_deactivate().
*/
-static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
- const struct nft_set *set, void *elem)
+void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv)
{
- struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
- kfree(elem);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
+
+ kfree(elem_priv);
+}
+
+static void nft_trans_elems_destroy(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++)
+ nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv);
+}
+
+int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_expr *expr_array[])
+{
+ struct nft_expr *expr;
+ int err, i, k;
+
+ for (i = 0; i < set->num_exprs; i++) {
+ expr = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
+ if (!expr)
+ goto err_expr;
+
+ err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
+ if (err < 0) {
+ kfree(expr);
+ goto err_expr;
+ }
+ expr_array[i] = expr;
+ }
+
+ return 0;
+
+err_expr:
+ for (k = i - 1; k >= 0; k--)
+ nft_expr_destroy(ctx, expr_array[k]);
+
+ return -ENOMEM;
+}
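
The clone loop fills expr_array front to back and, on failure, unwinds only the entries already built, in reverse order. The same partial-construction rollback pattern, generalized with hypothetical make/destroy callbacks:

#include <stddef.h>

/* Build n objects; on failure destroy the ones already built, in reverse. */
static int build_all(void *objs[], size_t n,
                     void *(*make)(size_t idx), void (*destroy)(void *obj))
{
        size_t i;

        for (i = 0; i < n; i++) {
                objs[i] = make(i);
                if (!objs[i])
                        goto unwind;
        }
        return 0;

unwind:
        while (i--)
                destroy(objs[i]);
        return -1;
}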
+
+static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
+ const struct nft_set_ext_tmpl *tmpl,
+ const struct nft_set_ext *ext,
+ struct nft_expr *expr_array[],
+ u32 num_exprs)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ u32 len = sizeof(struct nft_set_elem_expr);
+ struct nft_expr *expr;
+ int i, err;
+
+ if (num_exprs == 0)
+ return 0;
+
+ for (i = 0; i < num_exprs; i++)
+ len += expr_array[i]->ops->size;
+
+ if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
+ return -EINVAL;
+
+ for (i = 0; i < num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
+ if (err < 0)
+ goto err_elem_expr_setup;
+
+ elem_expr->size += expr_array[i]->ops->size;
+ nft_expr_destroy(ctx, expr_array[i]);
+ expr_array[i] = NULL;
+ }
+
+ return 0;
+
+err_elem_expr_setup:
+ for (; i < num_exprs; i++) {
+ nft_expr_destroy(ctx, expr_array[i]);
+ expr_array[i] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
+ const struct nft_set *set)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_cur(net);
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask) &&
+ !nft_set_elem_expired(ext) &&
+ !nft_set_elem_is_dead(ext))
+ return ext;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
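
Catchall elements are filtered against the current generation mask. As a working assumption drawn from the surrounding helpers, the per-element mask records the generations in which the element is inactive, so the active test is a cleared-bit check; a sketch of that convention (elem_active() and genmask_cur() are illustrative names):

#include <stdbool.h>
#include <stdint.h>

/* Two generations only: the cursor selects bit 0 or bit 1. */
static inline uint8_t genmask_cur(unsigned int gencursor)
{
        return 1U << (gencursor & 1);
}

/* An element is visible in a generation when its bit is NOT set in the
 * element's mask. */
static inline bool elem_active(uint8_t elem_genmask, uint8_t query_genmask)
{
        return !(elem_genmask & query_genmask);
}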
+
+static int nft_setelem_catchall_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_elem_priv **priv)
+{
+ struct nft_set_elem_catchall *catchall;
+ u8 genmask = nft_genmask_next(net);
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (nft_set_elem_active(ext, genmask)) {
+ *priv = catchall->elem;
+ return -EEXIST;
+ }
+ }
+
+ catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
+ if (!catchall)
+ return -ENOMEM;
+
+ catchall->elem = elem->priv;
+ list_add_tail_rcu(&catchall->list, &set->catchall_list);
+
+ return 0;
+}
+
+static int nft_setelem_insert(const struct net *net,
+ struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_elem_priv **elem_priv,
+ unsigned int flags)
+{
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_insert(net, set, elem, elem_priv);
+ else
+ ret = set->ops->insert(net, set, elem, elem_priv);
+
+ return ret;
+}
+
+static bool nft_setelem_is_catchall(const struct nft_set *set,
+ const struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
+ *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL)
+ return true;
+
+ return false;
+}
+
+static void nft_setelem_activate(struct net *net, struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+
+ if (nft_setelem_is_catchall(set, elem_priv)) {
+ nft_clear(net, ext);
+ } else {
+ set->ops->activate(net, set, elem_priv);
+ }
+}
+
+static void nft_trans_elem_update(const struct nft_set *set,
+ const struct nft_trans_one_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_elem_update *update = elem->update;
+
+ if (update->flags & NFT_TRANS_UPD_TIMEOUT)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout);
+
+ if (update->flags & NFT_TRANS_UPD_EXPIRATION)
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration);
+}
+
+static void nft_trans_elems_add(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ struct nft_trans_one_elem *elem = &te->elems[i];
+
+ if (elem->update)
+ nft_trans_elem_update(te->set, elem);
+ else
+ nft_setelem_activate(ctx->net, te->set, elem->priv);
+
+ nf_tables_setelem_notify(ctx, te->set, elem->priv,
+ NFT_MSG_NEWSETELEM);
+ kfree(elem->update);
+ }
+}
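
Each committed element is either an update request, carrying a kmemdup()'d nft_elem_update, or a fresh insertion to activate; both paths notify and then drop the update blob. A condensed sketch of that dispatch with hypothetical callbacks:

#include <stdlib.h>

struct pending_elem {
        void *priv;
        void *update;           /* NULL for a fresh insertion */
};

/* Commit one element: apply the queued update if present, otherwise
 * activate the new element; always notify, then free the update blob. */
static void commit_elem(struct pending_elem *e,
                        void (*apply_update)(struct pending_elem *e),
                        void (*activate)(void *priv),
                        void (*notify)(void *priv))
{
        if (e->update)
                apply_update(e);
        else
                activate(e->priv);
        notify(e->priv);
        free(e->update);
        e->update = NULL;
}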
+
+static int nft_setelem_catchall_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_is_active_next(net, ext))
+ continue;
+
+ kfree(elem->priv);
+ elem->priv = catchall->elem;
+ nft_set_elem_change_active(net, set, ext);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int __nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ void *priv;
+
+ priv = set->ops->deactivate(net, set, elem);
+ if (!priv)
+ return -ENOENT;
+
+ kfree(elem->priv);
+ elem->priv = priv;
+ set->ndeact++;
+
+ return 0;
+}
+
+static int nft_setelem_deactivate(const struct net *net,
+ struct nft_set *set,
+ struct nft_set_elem *elem, u32 flags)
+{
+ int ret;
+
+ if (flags & NFT_SET_ELEM_CATCHALL)
+ ret = nft_setelem_catchall_deactivate(net, set, elem);
+ else
+ ret = __nft_setelem_deactivate(net, set, elem);
+
+ return ret;
+}
+
+static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
+{
+ list_del_rcu(&catchall->list);
+ kfree_rcu(catchall, rcu);
+}
+
+static void nft_setelem_catchall_remove(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_set_elem_catchall *catchall, *next;
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ if (catchall->elem == elem_priv) {
+ nft_setelem_catchall_destroy(catchall);
+ break;
+ }
+ }
+}
+
+static void nft_setelem_remove(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ if (nft_setelem_is_catchall(set, elem_priv))
+ nft_setelem_catchall_remove(net, set, elem_priv);
+ else
+ set->ops->remove(net, set, elem_priv);
+}
+
+static void nft_trans_elems_remove(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ WARN_ON_ONCE(te->elems[i].update);
+
+ nf_tables_setelem_notify(ctx, te->set,
+ te->elems[i].priv,
+ te->nft_trans.msg_type);
+
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) {
+ atomic_dec(&te->set->nelems);
+ te->set->ndeact--;
+ }
+ }
+}
+
+static bool nft_setelem_valid_key_end(const struct nft_set *set,
+ struct nlattr **nla, u32 flags)
+{
+ if ((set->flags & (NFT_SET_CONCAT | NFT_SET_INTERVAL)) ==
+ (NFT_SET_CONCAT | NFT_SET_INTERVAL)) {
+ if (flags & NFT_SET_ELEM_INTERVAL_END)
+ return false;
+
+ if (nla[NFTA_SET_ELEM_KEY_END] &&
+ flags & NFT_SET_ELEM_CATCHALL)
+ return false;
+ } else {
+ if (nla[NFTA_SET_ELEM_KEY_END])
+ return false;
+ }
+
+ return true;
+}
+
+static u32 nft_set_maxsize(const struct nft_set *set)
+{
+ u32 maxsize, delta;
+
+ if (!set->size)
+ return UINT_MAX;
+
+ if (set->ops->adjust_maxsize)
+ delta = set->ops->adjust_maxsize(set);
+ else
+ delta = 0;
+
+ if (check_add_overflow(set->size, set->ndeact, &maxsize))
+ return UINT_MAX;
+
+ if (check_add_overflow(maxsize, delta, &maxsize))
+ return UINT_MAX;
+
+ return maxsize;
}
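
The effective capacity saturates at UINT_MAX rather than wrapping when set->size, the deactivated count and the backend's slack are summed. A portable sketch using the GCC/Clang builtin that the kernel's check_add_overflow() wraps:

#include <limits.h>

/* Saturating capacity: size + deactivated + backend slack, clamped to
 * UINT_MAX on wraparound, mirroring nft_set_maxsize() above. */
static unsigned int set_maxsize(unsigned int size, unsigned int ndeact,
                                unsigned int delta)
{
        unsigned int max;

        if (!size)
                return UINT_MAX;
        if (__builtin_add_overflow(size, ndeact, &max))
                return UINT_MAX;
        if (__builtin_add_overflow(max, delta, &max))
                return UINT_MAX;
        return max;
}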
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
{
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
u8 genmask = nft_genmask_next(ctx->net);
- struct nft_data_desc d1, d2;
+ u32 flags = 0, size = 0, num_exprs = 0;
struct nft_set_ext_tmpl tmpl;
struct nft_set_ext *ext, *ext2;
struct nft_set_elem elem;
struct nft_set_binding *binding;
+ struct nft_elem_priv *elem_priv;
struct nft_object *obj = NULL;
struct nft_userdata *udata;
- struct nft_data data;
+ struct nft_data_desc desc;
enum nft_registers dreg;
struct nft_trans *trans;
- u32 flags = 0;
+ u64 expiration;
u64 timeout;
+ int err, i;
u8 ulen;
- int err;
- err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
- nft_set_elem_policy, NULL);
+ err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy, NULL);
if (err < 0)
return err;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
- return -EINVAL;
-
nft_set_ext_prepare(&tmpl);
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+
+ if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
+ (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
+ return -EINVAL;
+
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
!(flags & NFT_SET_ELEM_INTERVAL_END))
return -EINVAL;
- if (nla[NFTA_SET_ELEM_DATA] != NULL &&
- flags & NFT_SET_ELEM_INTERVAL_END)
- return -EINVAL;
} else {
if (nla[NFTA_SET_ELEM_DATA] != NULL)
return -EINVAL;
}
+ if (set->flags & NFT_SET_OBJECT) {
+ if (!nla[NFTA_SET_ELEM_OBJREF] &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END))
+ return -EINVAL;
+ } else {
+ if (nla[NFTA_SET_ELEM_OBJREF])
+ return -EINVAL;
+ }
+
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
+ if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
+ (nla[NFTA_SET_ELEM_DATA] ||
+ nla[NFTA_SET_ELEM_OBJREF] ||
+ nla[NFTA_SET_ELEM_TIMEOUT] ||
+ nla[NFTA_SET_ELEM_EXPIRATION] ||
+ nla[NFTA_SET_ELEM_USERDATA] ||
+ nla[NFTA_SET_ELEM_EXPR] ||
+ nla[NFTA_SET_ELEM_KEY_END] ||
+ nla[NFTA_SET_ELEM_EXPRESSIONS]))
+ return -EINVAL;
+
timeout = 0;
if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
@@ -4360,48 +7289,152 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
&timeout);
if (err)
return err;
- } else if (set->flags & NFT_SET_TIMEOUT) {
+ } else if (set->flags & NFT_SET_TIMEOUT &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
timeout = set->timeout;
}
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- goto err1;
- err = -EINVAL;
- if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
- goto err2;
+ expiration = 0;
+ if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
+ if (!(set->flags & NFT_SET_TIMEOUT))
+ return -EINVAL;
+ if (timeout == 0)
+ return -EOPNOTSUPP;
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len);
- if (timeout > 0) {
- nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
- if (timeout != set->timeout)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
+ &expiration);
+ if (err)
+ return err;
+
+ if (expiration > timeout)
+ return -ERANGE;
}
- if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
- if (!(set->flags & NFT_SET_OBJECT)) {
- err = -EINVAL;
- goto err2;
+ if (nla[NFTA_SET_ELEM_EXPR]) {
+ struct nft_expr *expr;
+
+ if (set->num_exprs && set->num_exprs != 1)
+ return -EOPNOTSUPP;
+
+ expr = nft_set_elem_expr_alloc(ctx, set,
+ nla[NFTA_SET_ELEM_EXPR]);
+ if (IS_ERR(expr))
+ return PTR_ERR(expr);
+
+ expr_array[0] = expr;
+ num_exprs = 1;
+
+ if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
}
- obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+ } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
+ struct nft_expr *expr;
+ struct nlattr *tmp;
+ int left;
+
+ i = 0;
+ nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX ||
+ (set->num_exprs && set->num_exprs == i)) {
+ err = -E2BIG;
+ goto err_set_elem_expr;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_set_elem_expr;
+ }
+ expr = nft_set_elem_expr_alloc(ctx, set, tmp);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_elem_expr;
+ }
+ expr_array[i] = expr;
+ num_exprs++;
+
+ if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ i++;
+ }
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
+ goto err_set_elem_expr;
+ }
+ } else if (set->num_exprs > 0 &&
+ !(flags & NFT_SET_ELEM_INTERVAL_END)) {
+ err = nft_set_elem_expr_clone(ctx, set, expr_array);
+ if (err < 0)
+ goto err_set_elem_expr_clone;
+
+ num_exprs = set->num_exprs;
+ }
+
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ goto err_set_elem_expr;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto err_parse_key;
+ }
+
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ goto err_parse_key;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto err_parse_key_end;
+ }
+
+ if (set->flags & NFT_SET_TIMEOUT) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
+ if (err < 0)
+ goto err_parse_key_end;
+ }
+
+ if (num_exprs) {
+ for (i = 0; i < num_exprs; i++)
+ size += expr_array[i]->ops->size;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+ if (err < 0)
+ goto err_parse_key_end;
+ }
+
+ if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
+ obj = nft_obj_lookup(ctx->net, ctx->table,
+ nla[NFTA_SET_ELEM_OBJREF],
set->objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err2;
+ obj = NULL;
+ goto err_parse_key_end;
+ }
+
+ if (!nft_use_inc(&obj->use)) {
+ err = -EMFILE;
+ obj = NULL;
+ goto err_parse_key_end;
}
- nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+ if (err < 0)
+ goto err_parse_key_end;
}
if (nla[NFTA_SET_ELEM_DATA] != NULL) {
- err = nft_data_init(ctx, &data, sizeof(data), &d2,
- nla[NFTA_SET_ELEM_DATA]);
+ err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
+ nla[NFTA_SET_ELEM_DATA]);
if (err < 0)
- goto err2;
-
- err = -EINVAL;
- if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
- goto err3;
+ goto err_parse_key_end;
dreg = nft_type_to_reg(set->dtype);
list_for_each_entry(binding, &set->bindings, list) {
@@ -4416,19 +7449,21 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
continue;
err = nft_validate_register_store(&bind_ctx, dreg,
- &data,
- d2.type, d2.len);
+ &elem.data.val,
+ desc.type, desc.len);
if (err < 0)
- goto err3;
+ goto err_parse_data;
- if (d2.type == NFT_DATA_VERDICT &&
- (data.verdict.code == NFT_GOTO ||
- data.verdict.code == NFT_JUMP))
- nft_validate_state_update(ctx->net,
+ if (desc.type == NFT_DATA_VERDICT &&
+ (elem.data.val.verdict.code == NFT_GOTO ||
+ elem.data.val.verdict.code == NFT_JUMP))
+ nft_validate_state_update(ctx->table,
NFT_VALIDATE_NEED);
}
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
+ if (err < 0)
+ goto err_parse_data;
}
/* The full maximum length of userdata can exceed the maximum
@@ -4438,45 +7473,59 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
ulen = 0;
if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
- if (ulen > 0)
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
- ulen);
+ if (ulen > 0) {
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
+ ulen);
+ if (err < 0)
+ goto err_parse_data;
+ }
}
- err = -ENOMEM;
- elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
- timeout, GFP_KERNEL);
- if (elem.priv == NULL)
- goto err3;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+ elem.key_end.val.data, elem.data.val.data,
+ timeout, expiration, GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
+ goto err_parse_data;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
+
+ if (obj)
+ *nft_set_ext_obj(ext) = obj;
+
if (ulen > 0) {
+ if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
+ err = -EINVAL;
+ goto err_elem_free;
+ }
udata = nft_set_ext_userdata(ext);
udata->len = ulen - 1;
nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
}
- if (obj) {
- *nft_set_ext_obj(ext) = obj;
- obj->use++;
- }
+ err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
+ if (err < 0)
+ goto err_elem_free;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
- if (trans == NULL)
- goto err4;
+ if (trans == NULL) {
+ err = -ENOMEM;
+ goto err_elem_free;
+ }
+
+ ext->genmask = nft_genmask_cur(ctx->net);
- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
- err = set->ops->insert(ctx->net, set, &elem, &ext2);
+ err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
if (err) {
if (err == -EEXIST) {
+ ext2 = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
- nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
- err = -EBUSY;
- goto err5;
- }
+ nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
+ goto err_element_clash;
if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
memcmp(nft_set_ext_data(ext),
@@ -4484,47 +7533,96 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
(nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
*nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
- err = -EBUSY;
- else if (!(nlmsg_flags & NLM_F_EXCL))
+ goto err_element_clash;
+ else if (!(nlmsg_flags & NLM_F_EXCL)) {
err = 0;
+ if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) {
+ struct nft_elem_update update = { };
+
+ if (timeout != nft_set_ext_timeout(ext2)->timeout) {
+ update.timeout = timeout;
+ if (expiration == 0)
+ expiration = timeout;
+
+ update.flags |= NFT_TRANS_UPD_TIMEOUT;
+ }
+ if (expiration) {
+ update.expiration = expiration;
+ update.flags |= NFT_TRANS_UPD_EXPIRATION;
+ }
+
+ if (update.flags) {
+ struct nft_trans_one_elem *ue;
+
+ ue = &nft_trans_container_elem(trans)->elems[0];
+
+ ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL);
+ if (!ue->update) {
+ err = -ENOMEM;
+ goto err_element_clash;
+ }
+
+ ue->priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
+ goto err_elem_free;
+ }
+ }
+ }
+ } else if (err == -ENOTEMPTY) {
+ /* ENOTEMPTY reports overlapping between this element
+ * and an existing one.
+ */
+ err = -EEXIST;
}
- goto err5;
+ goto err_element_clash;
}
- if (set->size &&
- !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
- err = -ENFILE;
- goto err6;
+ if (!(flags & NFT_SET_ELEM_CATCHALL)) {
+ unsigned int max = nft_set_maxsize(set);
+
+ if (!atomic_add_unless(&set->nelems, 1, max)) {
+ err = -ENFILE;
+ goto err_set_full;
+ }
}
- nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
-err6:
- set->ops->remove(ctx->net, set, &elem);
-err5:
+err_set_full:
+ nft_setelem_remove(ctx->net, set, elem.priv);
+err_element_clash:
kfree(trans);
-err4:
- if (obj)
- obj->use--;
- kfree(elem.priv);
-err3:
+err_elem_free:
+ nf_tables_set_elem_destroy(ctx, set, elem.priv);
+err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_release(&data, d2.type);
-err2:
- nft_data_release(&elem.key.val, d1.type);
-err1:
+ nft_data_release(&elem.data.val, desc.type);
+err_parse_key_end:
+ if (obj)
+ nft_use_dec_restore(&obj->use);
+
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
+err_parse_key:
+ nft_data_release(&elem.key.val, NFT_DATA_VALUE);
+err_set_elem_expr:
+ for (i = 0; i < num_exprs && expr_array[i]; i++)
+ nft_expr_destroy(ctx, expr_array[i]);
+err_set_elem_expr_clone:
return err;
}
-static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- u8 genmask = nft_genmask_next(net);
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err;
@@ -4532,27 +7630,36 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+ set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
- if (IS_ERR(set))
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
+ }
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ if (!list_empty(&set->bindings) &&
+ (set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)))
return -EBUSY;
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags);
- if (err < 0)
+ err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
return err;
+ }
}
- if (net->nft.validate_state == NFT_VALIDATE_DO)
- return nft_table_validate(net, ctx.table);
+ if (table->validate_state == NFT_VALIDATE_DO)
+ return nft_table_validate(net, table);
return 0;
}
@@ -4570,38 +7677,100 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
*/
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
+ struct nft_chain *chain;
+
if (type == NFT_DATA_VERDICT) {
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use++;
+ chain = data->verdict.chain;
+ nft_use_inc_restore(&chain->use);
break;
}
}
}
-static void nft_set_elem_activate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+static int nft_setelem_active_next(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
+ u8 genmask = nft_genmask_next(net);
+
+ return nft_set_elem_active(ext, genmask);
+}
+
+static void nft_setelem_data_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_hold(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use++;
+ nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
-static void nft_set_elem_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
- (*nft_set_ext_obj(ext))->use--;
+ nft_use_dec(&(*nft_set_ext_obj(ext))->use);
+}
+
+/* Similar to nft_trans_elems_remove(), but called from the abort path to
+ * undo a new set element. No notifications and no ndeact changes.
+ *
+ * Returns true if the set had been added to (i.e. elements need to be
+ * removed again).
+ */
+static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
+ struct nft_trans_elem *te)
+{
+ bool removed = false;
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (te->elems[i].update) {
+ kfree(te->elems[i].update);
+ te->elems[i].update = NULL;
+ /* Update request, so do not release this element */
+ te->elems[i].priv = NULL;
+ continue;
+ }
+
+ if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ atomic_dec(&te->set->nelems);
+
+ removed = true;
+ }
+
+ return removed;
+}
+
+/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */
+static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
+ const struct nft_trans_elem *te)
+{
+ int i;
+
+ for (i = 0; i < te->nelems; i++) {
+ if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) {
+ nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv);
+ nft_setelem_activate(ctx->net, te->set, te->elems[i].priv);
+ }
+
+ if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ te->set->ndeact--;
+ }
}
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
@@ -4609,178 +7778,230 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_ext_tmpl tmpl;
- struct nft_data_desc desc;
struct nft_set_elem elem;
struct nft_set_ext *ext;
struct nft_trans *trans;
u32 flags = 0;
- void *priv;
int err;
- err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
- nft_set_elem_policy, NULL);
+ err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
+ nft_set_elem_policy, NULL);
if (err < 0)
- goto err1;
-
- err = -EINVAL;
- if (nla[NFTA_SET_ELEM_KEY] == NULL)
- goto err1;
-
- nft_set_ext_prepare(&tmpl);
+ return err;
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
- if (flags != 0)
- nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
- err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
- nla[NFTA_SET_ELEM_KEY]);
- if (err < 0)
- goto err1;
+ if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
+ return -EINVAL;
- err = -EINVAL;
- if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
- goto err2;
+ if (!nft_setelem_valid_key_end(set, nla, flags))
+ return -EINVAL;
+
+ nft_set_ext_prepare(&tmpl);
- nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+ if (flags != 0) {
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
+ if (err < 0)
+ return err;
+ }
+
+ if (nla[NFTA_SET_ELEM_KEY]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+ nla[NFTA_SET_ELEM_KEY]);
+ if (err < 0)
+ return err;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+ if (err < 0)
+ goto fail_elem;
+ }
+
+ if (nla[NFTA_SET_ELEM_KEY_END]) {
+ err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+ nla[NFTA_SET_ELEM_KEY_END]);
+ if (err < 0)
+ goto fail_elem;
+
+ err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+ if (err < 0)
+ goto fail_elem_key_end;
+ }
err = -ENOMEM;
- elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
- GFP_KERNEL);
- if (elem.priv == NULL)
- goto err2;
+ elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+ elem.key_end.val.data, NULL, 0, 0,
+ GFP_KERNEL_ACCOUNT);
+ if (IS_ERR(elem.priv)) {
+ err = PTR_ERR(elem.priv);
+ goto fail_elem_key_end;
+ }
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
- if (trans == NULL) {
- err = -ENOMEM;
- goto err3;
- }
+ if (trans == NULL)
+ goto fail_trans;
- priv = set->ops->deactivate(ctx->net, set, &elem);
- if (priv == NULL) {
- err = -ENOENT;
- goto err4;
- }
- kfree(elem.priv);
- elem.priv = priv;
+ err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
+ if (err < 0)
+ goto fail_ops;
- nft_set_elem_deactivate(ctx->net, set, &elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem.priv);
- nft_trans_elem(trans) = elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
-err4:
+fail_ops:
kfree(trans);
-err3:
+fail_trans:
kfree(elem.priv);
-err2:
- nft_data_release(&elem.key.val, desc.type);
-err1:
+fail_elem_key_end:
+ nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
+fail_elem:
+ nft_data_release(&elem.key.val, NFT_DATA_VALUE);
return err;
}
-static int nft_flush_set(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
+static int nft_setelem_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ const struct nft_set_iter *iter,
+ struct nft_elem_priv *elem_priv)
{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_trans *trans;
- int err;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- sizeof(struct nft_trans_elem), GFP_ATOMIC);
+ if (!nft_set_elem_active(ext, iter->genmask))
+ return 0;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+ struct_size_t(struct nft_trans_elem, elems, 1));
if (!trans)
return -ENOMEM;
- if (!set->ops->flush(ctx->net, set, elem->priv)) {
- err = -ENOENT;
- goto err1;
- }
+ set->ops->flush(ctx->net, set, elem_priv);
set->ndeact++;
- nft_set_elem_deactivate(ctx->net, set, elem);
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
nft_trans_elem_set(trans) = set;
- nft_trans_elem(trans) = *elem;
- list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+ nft_trans_container_elem(trans)->nelems = 1;
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
-err1:
- kfree(trans);
- return err;
}
-static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- u8 genmask = nft_genmask_next(net);
+ struct nft_trans *trans;
+
+ trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
+ if (!trans)
+ return -ENOMEM;
+
+ nft_setelem_data_deactivate(ctx->net, set, elem_priv);
+ nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
+ nft_trans_commit_list_add_elem(ctx->net, trans);
+
+ return 0;
+}
+
+static int nft_set_catchall_flush(const struct nft_ctx *ctx,
+ struct nft_set *set)
+{
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_set_elem_catchall *catchall;
+ struct nft_set_ext *ext;
+ int ret = 0;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list,
+ lockdep_commit_lock_is_held(ctx->net)) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+ if (!nft_set_elem_active(ext, genmask))
+ continue;
+
+ ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
+ if (ret < 0)
+ break;
+ nft_set_elem_change_active(ctx->net, set, ext);
+ }
+
+ return ret;
+}
+
+static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
+{
+ struct nft_set_iter iter = {
+ .genmask = genmask,
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_setelem_flush,
+ };
+
+ set->ops->walk(ctx, set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_flush(ctx, set);
+
+ return iter.err;
+}
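
Flushing walks the backend through an iterator whose callback runs once per element, then handles catchall entries separately because they live outside the backend. A generic rendition of the callback-iterator shape over an array-backed set (all types here are hypothetical):

#include <stddef.h>

struct set_iter {
        unsigned char genmask;
        int err;
        int (*fn)(struct set_iter *iter, void *elem_priv);
};

/* Walk the elements, stopping at the first callback error; the error is
 * reported through iter->err, as with struct nft_set_iter. */
static void set_walk(void **elems, size_t n, struct set_iter *iter)
{
        for (size_t i = 0; i < n; i++) {
                iter->err = iter->fn(iter, elems[i]);
                if (iter->err < 0)
                        return;
        }
}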
+
+static int nf_tables_delsetelem(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
+ struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
int rem, err = 0;
- err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack,
- genmask);
- if (err < 0)
- return err;
+ table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+ genmask, NETLINK_CB(skb).portid);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+ return PTR_ERR(table);
+ }
- set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
- if (IS_ERR(set))
+ set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+ if (IS_ERR(set)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
return PTR_ERR(set);
- if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
+ }
+
+ if (nft_set_is_anonymous(set))
+ return -EOPNOTSUPP;
+
+ if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
return -EBUSY;
- if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) {
- struct nft_set_iter iter = {
- .genmask = genmask,
- .fn = nft_flush_set,
- };
- set->ops->walk(&ctx, set, &iter);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- return iter.err;
- }
+ if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+ return nft_set_flush(&ctx, set, genmask);
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
err = nft_del_setelem(&ctx, set, attr);
- if (err < 0)
- break;
+ if (err == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
+ continue;
- set->ndeact++;
+ if (err < 0) {
+ NL_SET_BAD_ATTR(extack, attr);
+ return err;
+ }
}
- return err;
-}
-
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
- struct nft_set_gc_batch *gcb;
- unsigned int i;
-
- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
- for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
- kfree(gcb);
-}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp)
-{
- struct nft_set_gc_batch *gcb;
- gcb = kzalloc(sizeof(*gcb), gfp);
- if (gcb == NULL)
- return gcb;
- gcb->head.set = set;
- return gcb;
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/*
* Stateful objects
@@ -4788,7 +8009,7 @@ EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/**
 * nft_register_obj - register nf_tables stateful object type
- * @obj: object type
+ * @obj_type: object type
*
* Registers the object type for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
@@ -4807,7 +8028,7 @@ EXPORT_SYMBOL_GPL(nft_register_obj);
/**
* nft_unregister_obj - unregister nf_tables object type
- * @obj: object type
+ * @obj_type: object type
*
* Unregisters the object type for use with nf_tables.
*/
@@ -4819,18 +8040,36 @@ void nft_unregister_obj(struct nft_object_type *obj_type)
}
EXPORT_SYMBOL_GPL(nft_unregister_obj);
-struct nft_object *nft_obj_lookup(const struct nft_table *table,
+struct nft_object *nft_obj_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u32 objtype,
u8 genmask)
{
+ struct nft_object_hash_key k = { .table = table };
+ char search[NFT_OBJ_MAXNAMELEN];
+ struct rhlist_head *tmp, *list;
struct nft_object *obj;
- list_for_each_entry_rcu(obj, &table->objects, list) {
- if (!nla_strcmp(nla, obj->name) &&
- objtype == obj->ops->type->type &&
- nft_active_genmask(obj, genmask))
+ nla_strscpy(search, nla, sizeof(search));
+ k.name = search;
+
+ WARN_ON_ONCE(!rcu_read_lock_held() &&
+ !lockdep_commit_lock_is_held(net));
+
+ rcu_read_lock();
+ list = rhltable_lookup(&nft_objname_ht, &k, nft_objname_ht_params);
+ if (!list)
+ goto out;
+
+ rhl_for_each_entry_rcu(obj, tmp, list, rhlhead) {
+ if (objtype == obj->ops->type->type &&
+ nft_active_genmask(obj, genmask)) {
+ rcu_read_unlock();
return obj;
+ }
}
+out:
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(nft_obj_lookup);
@@ -4858,6 +8097,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
[NFTA_OBJ_HANDLE] = { .type = NLA_U64},
+ [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -4874,8 +8115,8 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
goto err1;
if (attr) {
- err = nla_parse_nested(tb, type->maxattr, attr, type->policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, type->maxattr, attr,
+ type->policy, NULL);
if (err < 0)
goto err2;
} else {
@@ -4893,7 +8134,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
}
err = -ENOMEM;
- obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+ obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
if (!obj)
goto err2;
@@ -4918,7 +8159,7 @@ static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
{
struct nlattr *nest;
- nest = nla_nest_start(skb, attr);
+ nest = nla_nest_start_noflag(skb, attr);
if (!nest)
goto nla_put_failure;
if (obj->ops->dump(skb, obj, reset) < 0)
@@ -4930,11 +8171,15 @@ nla_put_failure:
return -1;
}
-static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
{
const struct nft_object_type *type;
- list_for_each_entry(type, &nf_tables_objects, list) {
+ list_for_each_entry_rcu(type, &nf_tables_objects, list) {
+ if (type->family != NFPROTO_UNSPEC &&
+ type->family != family)
+ continue;
+
if (objtype == type->type)
return type;
}
@@ -4942,34 +8187,71 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
}
static const struct nft_object_type *
-nft_obj_type_get(struct net *net, u32 objtype)
+nft_obj_type_get(struct net *net, u32 objtype, u8 family)
{
const struct nft_object_type *type;
- type = __nft_obj_type_get(objtype);
- if (type != NULL && try_module_get(type->owner))
+ rcu_read_lock();
+ type = __nft_obj_type_get(objtype, family);
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nft-obj-%u", objtype);
- if (__nft_obj_type_get(objtype))
+ if (nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
return ERR_PTR(-ENOENT);
}
-static int nf_tables_newobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_updobj(const struct nft_ctx *ctx,
+ const struct nft_object_type *type,
+ const struct nlattr *attr,
+ struct nft_object *obj)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nft_object *newobj;
+ struct nft_trans *trans;
+ int err = -ENOMEM;
+
+ /* caller must have obtained type->owner reference. */
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
+ sizeof(struct nft_trans_obj));
+ if (!trans)
+ goto err_trans;
+
+ newobj = nft_obj_init(ctx, type, attr);
+ if (IS_ERR(newobj)) {
+ err = PTR_ERR(newobj);
+ goto err_free_trans;
+ }
+
+ nft_trans_obj(trans) = obj;
+ nft_trans_obj_update(trans) = true;
+ nft_trans_obj_newobj(trans) = newobj;
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_free_trans:
+ kfree(trans);
+err_trans:
+ module_put(type->owner);
+ return err;
+}
+
+static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_object_type *type;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct net *net = info->net;
struct nft_table *table;
struct nft_object *obj;
struct nft_ctx ctx;
@@ -4981,14 +8263,15 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
!nla[NFTA_OBJ_DATA])
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
}
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
if (err != -ENOENT) {
@@ -4996,48 +8279,88 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
return -EEXIST;
}
- return 0;
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (!obj->ops->update)
+ return 0;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (WARN_ON_ONCE(IS_ERR(type)))
+ return PTR_ERR(type);
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ /* type->owner reference is put when transaction object is released. */
+ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- type = nft_obj_type_get(net, objtype);
- if (IS_ERR(type))
- return PTR_ERR(type);
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ type = nft_obj_type_get(net, objtype, family);
+ if (IS_ERR(type)) {
+ err = PTR_ERR(type);
+ goto err_type;
+ }
obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
- goto err1;
+ goto err_init;
}
- obj->table = table;
+ obj->key.table = table;
obj->handle = nf_tables_alloc_handle(table);
- obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
- if (!obj->name) {
+ obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
+ if (!obj->key.name) {
err = -ENOMEM;
- goto err2;
+ goto err_strdup;
+ }
+
+ if (nla[NFTA_OBJ_USERDATA]) {
+ obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
+ if (obj->udata == NULL)
+ goto err_userdata;
+
+ obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
}
err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
if (err < 0)
- goto err3;
+ goto err_trans;
+
+ err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
+ nft_objname_ht_params);
+ if (err < 0)
+ goto err_obj_ht;
list_add_tail_rcu(&obj->list, &table->objects);
- table->use++;
+
return 0;
-err3:
- kfree(obj->name);
-err2:
+err_obj_ht:
+ /* queued in transaction log */
+ INIT_LIST_HEAD(&obj->list);
+ return err;
+err_trans:
+ kfree(obj->udata);
+err_userdata:
+ kfree(obj->key.name);
+err_strdup:
if (obj->ops->destroy)
obj->ops->destroy(&ctx, obj);
kfree(obj);
-err1:
+err_init:
module_put(type->owner);
+err_type:
+ nft_use_dec_restore(&table->use);
+
return err;
}
@@ -5046,28 +8369,35 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
int family, const struct nft_table *table,
struct nft_object *obj, bool reset)
{
- struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
- nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
+ nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
- nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
- nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
NFTA_OBJ_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELOBJ ||
+ event == NFT_MSG_DESTROYOBJ) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ goto nla_put_failure;
+
+ if (obj->udata &&
+ nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -5076,168 +8406,247 @@ nla_put_failure:
return -1;
}
-struct nft_obj_filter {
+static void audit_log_obj_reset(const struct nft_table *table,
+ unsigned int base_seq, unsigned int nentries)
+{
+ char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
+
+ audit_log_nfcfg(buf, table->family, nentries,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+}
+
+struct nft_obj_dump_ctx {
+ unsigned int s_idx;
char *table;
u32 type;
+ bool reset;
};
static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
{
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
- const struct nft_table *table;
- unsigned int idx = 0, s_idx = cb->args[0];
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
+ struct nftables_pernet *nft_net;
+ const struct nft_table *table;
+ unsigned int entries = 0;
struct nft_object *obj;
- bool reset = false;
-
- if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ unsigned int idx = 0;
+ int rc = 0;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
+ entries = 0;
list_for_each_entry_rcu(obj, &table->objects, list) {
if (!nft_is_active(net, obj))
goto cont;
- if (idx < s_idx)
+ if (idx < ctx->s_idx)
goto cont;
- if (idx > s_idx)
- memset(&cb->args[1], 0,
- sizeof(cb->args) - sizeof(cb->args[0]));
- if (filter && filter->table &&
- strcmp(filter->table, table->name))
+ if (ctx->table && strcmp(ctx->table, table->name))
goto cont;
- if (filter &&
- filter->type != NFT_OBJECT_UNSPEC &&
- obj->ops->type->type != filter->type)
+ if (ctx->type != NFT_OBJECT_UNSPEC &&
+ obj->ops->type->type != ctx->type)
goto cont;
- if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NFT_MSG_NEWOBJ,
- NLM_F_MULTI | NLM_F_APPEND,
- table->family, table,
- obj, reset) < 0)
- goto done;
+ rc = nf_tables_fill_obj_info(skb, net,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NFT_MSG_NEWOBJ,
+ NLM_F_MULTI | NLM_F_APPEND,
+ table->family, table,
+ obj, ctx->reset);
+ if (rc < 0)
+ break;
+ entries++;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
idx++;
}
+ if (ctx->reset && entries)
+ audit_log_obj_reset(table, nft_base_seq(net), entries);
+ if (rc < 0)
+ break;
}
-done:
rcu_read_unlock();
- cb->args[0] = idx;
+ ctx->s_idx = idx;
return skb->len;
}
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+ int ret;
+
+ mutex_lock(&nft_net->commit_mutex);
+ ret = nf_tables_dump_obj(skb, cb);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return ret;
+}
+
static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
const struct nlattr * const *nla = cb->data;
- struct nft_obj_filter *filter = NULL;
-
- if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
- filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
- if (!filter)
- return -ENOMEM;
- if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
- if (!filter->table) {
- kfree(filter);
- return -ENOMEM;
- }
- }
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
- if (nla[NFTA_OBJ_TYPE])
- filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+ if (nla[NFTA_OBJ_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+ if (!ctx->table)
+ return -ENOMEM;
}
- cb->data = filter;
+ if (nla[NFTA_OBJ_TYPE])
+ ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
return 0;
}
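
The dump state lives directly in the netlink callback's fixed scratch space, and BUILD_BUG_ON() turns an oversized context into a build failure. The userspace analogue is C11 static_assert; the 48-byte scratch size below is an assumption for illustration, not the definitive netlink_callback layout:

#include <assert.h>     /* C11 static_assert */

struct dump_ctx {
        unsigned int s_idx;
        char *table;
        unsigned int type;
        _Bool reset;
};

struct callback {
        unsigned char ctx[48];  /* fixed scratch area, size assumed */
};

/* Fail at compile time, not at run time, if the context outgrows the
 * scratch area: the same contract BUILD_BUG_ON() enforces above. */
static_assert(sizeof(struct dump_ctx) <= sizeof(((struct callback *)0)->ctx),
              "dump context must fit the callback scratch area");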
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+ ctx->reset = true;
+
+ return nf_tables_dump_obj_start(cb);
+}
+
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
- struct nft_obj_filter *filter = cb->data;
+ struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
- if (filter) {
- kfree(filter->table);
- kfree(filter);
- }
+ kfree(ctx->table);
return 0;
}
-/* called with rcu_read_lock held */
-static int nf_tables_getobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+/* Caller must hold rcu read lock or transaction mutex */
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
+ const struct nlattr * const nla[], bool reset)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nft_table *table;
+ struct net *net = info->net;
struct nft_object *obj;
struct sk_buff *skb2;
- bool reset = false;
u32 objtype;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .start = nf_tables_dump_obj_start,
- .dump = nf_tables_dump_obj,
- .done = nf_tables_dump_obj_done,
- .module = THIS_MODULE,
- .data = (void *)nla,
- };
-
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
- }
-
if (!nla[NFTA_OBJ_NAME] ||
!nla[NFTA_OBJ_TYPE])
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
- return PTR_ERR(table);
+ return ERR_CAST(table);
}
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
if (IS_ERR(obj)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
- return PTR_ERR(obj);
+ return ERR_CAST(obj);
}
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
- return -ENOMEM;
-
- if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
- reset = true;
+ return ERR_PTR(-ENOMEM);
- err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+ err = nf_tables_fill_obj_info(skb2, net, portid,
+ info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
family, table, obj, reset);
- if (err < 0)
- goto err;
+ if (err < 0) {
+ kfree_skb(skb2);
+ return ERR_PTR(err);
+ }
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
- kfree_skb(skb2);
- return err;
+ return skb2;
+}
+
+static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ u32 portid = NETLINK_CB(skb).portid;
+ struct sk_buff *skb2;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dump_obj_start,
+ .dump = nf_tables_dump_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ skb2 = nf_tables_getobj_single(portid, info, nla, false);
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
+ return nfnetlink_unicast(skb2, info->net, portid);
+}
+
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct nftables_pernet *nft_net = nft_pernet(info->net);
+ u32 portid = NETLINK_CB(skb).portid;
+ struct net *net = info->net;
+ struct sk_buff *skb2;
+ char *buf;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nf_tables_dumpreset_obj_start,
+ .dump = nf_tables_dumpreset_obj,
+ .done = nf_tables_dump_obj_done,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
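+	/* Resetting a stateful object requires the transaction mutex:
+	 * pin this module, drop the RCU read lock taken by the
+	 * nfnetlink RCU callback path, take the mutex, then restore
+	 * the RCU lock before returning.
+	 */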
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+ rcu_read_unlock();
+ mutex_lock(&nft_net->commit_mutex);
+ skb2 = nf_tables_getobj_single(portid, info, nla, true);
+ mutex_unlock(&nft_net->commit_mutex);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ if (IS_ERR(skb2))
+ return PTR_ERR(skb2);
+
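+	/* Emit an audit record for the reset, keyed by table name and
+	 * generation.
+	 */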
+ buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
+ nla_len(nla[NFTA_OBJ_TABLE]),
+ (char *)nla_data(nla[NFTA_OBJ_TABLE]),
+ nft_base_seq(net));
+ audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
+ AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+ kfree(buf);
+
+ return nfnetlink_unicast(skb2, net, portid);
}
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
@@ -5246,18 +8655,18 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
obj->ops->destroy(ctx, obj);
module_put(obj->ops->type->owner);
- kfree(obj->name);
+ kfree(obj->key.name);
+ kfree(obj->udata);
kfree(obj);
}
-static int nf_tables_delobj(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_object *obj;
@@ -5268,7 +8677,8 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
(!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
- table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask);
+ table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
+ NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
return PTR_ERR(table);
@@ -5280,10 +8690,14 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask);
} else {
attr = nla[NFTA_OBJ_NAME];
- obj = nft_obj_lookup(table, attr, objtype, genmask);
+ obj = nft_obj_lookup(net, table, attr, objtype, genmask);
}
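+	/* NFT_MSG_DESTROYOBJ is the delete-if-exists variant: a missing
+	 * object is not an error.
+	 */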
if (IS_ERR(obj)) {
+ if (PTR_ERR(obj) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(obj);
}
@@ -5292,15 +8706,17 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
return -EBUSY;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
return nft_delobj(&ctx, obj);
}
-void nft_obj_notify(struct net *net, struct nft_table *table,
- struct nft_object *obj, u32 portid, u32 seq, int event,
- int family, int report, gfp_t gfp)
+static void
+__nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct sk_buff *skb;
int err;
@@ -5312,25 +8728,47 @@ void nft_obj_notify(struct net *net, struct nft_table *table,
if (skb == NULL)
goto err;
- err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
- table, obj, false);
+ err = nf_tables_fill_obj_info(skb, net, portid, seq, event,
+ flags & (NLM_F_CREATE | NLM_F_EXCL),
+ family, table, obj, false);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
+ nft_notify_enqueue(skb, report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
+
+void nft_obj_notify(struct net *net, const struct nft_table *table,
+ struct nft_object *obj, u32 portid, u32 seq, int event,
+ u16 flags, int family, int report, gfp_t gfp)
+{
+ char *buf = kasprintf(gfp, "%s:%u",
+ table->name, nft_base_seq(net));
+
+ audit_log_nfcfg(buf,
+ family,
+ obj->handle,
+ event == NFT_MSG_NEWOBJ ?
+ AUDIT_NFT_OP_OBJ_REGISTER :
+ AUDIT_NFT_OP_OBJ_UNREGISTER,
+ gfp);
+ kfree(buf);
+
+ __nft_obj_notify(net, table, obj, portid, seq, event,
+ flags, family, report, gfp);
+}
EXPORT_SYMBOL_GPL(nft_obj_notify);
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
struct nft_object *obj, int event)
{
- nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event,
- ctx->family, ctx->report, GFP_KERNEL);
+ __nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+ ctx->seq, event, ctx->flags, ctx->family,
+ ctx->report, GFP_KERNEL);
}
/*
@@ -5359,14 +8797,17 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
.len = NFT_NAME_MAXLEN - 1 },
[NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED },
[NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
+ [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 },
};
-struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
+struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
+ const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
struct nft_flowtable *flowtable;
- list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list,
+ lockdep_commit_lock_is_held(net)) {
if (!nla_strcmp(nla, flowtable->name) &&
nft_active_genmask(flowtable, genmask))
return flowtable;
@@ -5375,6 +8816,23 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
}
EXPORT_SYMBOL_GPL(nft_flowtable_lookup);
+void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
+ struct nft_flowtable *flowtable,
+ enum nft_trans_phase phase)
+{
+ switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ case NFT_TRANS_PREPARE:
+ case NFT_TRANS_ABORT:
+ case NFT_TRANS_RELEASE:
+ nft_use_dec(&flowtable->use);
+ fallthrough;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable);
+
static struct nft_flowtable *
nft_flowtable_lookup_byhandle(const struct nft_table *table,
const struct nlattr *nla, u8 genmask)
@@ -5389,42 +8847,11 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table,
return ERR_PTR(-ENOENT);
}
-static int nf_tables_parse_devices(const struct nft_ctx *ctx,
- const struct nlattr *attr,
- struct net_device *dev_array[], int *len)
-{
- const struct nlattr *tmp;
- struct net_device *dev;
- char ifname[IFNAMSIZ];
- int rem, n = 0, err;
-
- nla_for_each_nested(tmp, attr, rem) {
- if (nla_type(tmp) != NFTA_DEVICE_NAME) {
- err = -EINVAL;
- goto err1;
- }
-
- nla_strlcpy(ifname, tmp, IFNAMSIZ);
- dev = __dev_get_by_name(ctx->net, ifname);
- if (!dev) {
- err = -ENOENT;
- goto err1;
- }
-
- dev_array[n++] = dev;
- if (n == NFT_FLOWTABLE_DEVICE_MAX) {
- err = -EFBIG;
- goto err1;
- }
- }
- if (!len)
- return -EINVAL;
-
- err = 0;
-err1:
- *len = n;
- return err;
-}
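+/* Parsed hook configuration for a flowtable: hook number, priority and
+ * the list of per-device nft_hook entries.
+ */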
+struct nft_flowtable_hook {
+ u32 num;
+ int priority;
+ struct list_head list;
+};
static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
[NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 },
@@ -5432,63 +8859,87 @@ static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX
[NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED },
};
-static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
- const struct nlattr *attr,
- struct nft_flowtable *flowtable)
+static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
+ const struct nlattr * const nla[],
+ struct nft_flowtable_hook *flowtable_hook,
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack, bool add)
{
- struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
struct nf_hook_ops *ops;
+ struct nft_hook *hook;
int hooknum, priority;
- int err, n = 0, i;
+ int err;
- err = nla_parse_nested(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
- nft_flowtable_hook_policy, NULL);
+ INIT_LIST_HEAD(&flowtable_hook->list);
+
+ err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
+ nla[NFTA_FLOWTABLE_HOOK],
+ nft_flowtable_hook_policy, NULL);
if (err < 0)
return err;
- if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
- !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] ||
- !tb[NFTA_FLOWTABLE_HOOK_DEVS])
- return -EINVAL;
+ if (add) {
+ if (!tb[NFTA_FLOWTABLE_HOOK_NUM] ||
+ !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
+ return -ENOENT;
+ }
- hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
- if (hooknum != NF_NETDEV_INGRESS)
- return -EINVAL;
+ hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+ if (hooknum != NF_NETDEV_INGRESS)
+ return -EOPNOTSUPP;
- priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+ priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
- err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
- dev_array, &n);
- if (err < 0)
- return err;
+ flowtable_hook->priority = priority;
+ flowtable_hook->num = hooknum;
+ } else {
+ if (tb[NFTA_FLOWTABLE_HOOK_NUM]) {
+ hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
+ if (hooknum != flowtable->hooknum)
+ return -EOPNOTSUPP;
+ }
- ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
+ if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) {
+ priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
+ if (priority != flowtable->data.priority)
+ return -EOPNOTSUPP;
+ }
- flowtable->hooknum = hooknum;
- flowtable->priority = priority;
- flowtable->ops = ops;
- flowtable->ops_len = n;
+ flowtable_hook->priority = flowtable->data.priority;
+ flowtable_hook->num = flowtable->hooknum;
+ }
- for (i = 0; i < n; i++) {
- flowtable->ops[i].pf = NFPROTO_NETDEV;
- flowtable->ops[i].hooknum = hooknum;
- flowtable->ops[i].priority = priority;
- flowtable->ops[i].priv = &flowtable->data;
- flowtable->ops[i].hook = flowtable->data.type->hook;
- flowtable->ops[i].dev = dev_array[i];
+ if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) {
+ err = nf_tables_parse_netdev_hooks(ctx->net,
+ tb[NFTA_FLOWTABLE_HOOK_DEVS],
+ &flowtable_hook->list,
+ extack);
+ if (err < 0)
+ return err;
+ }
+
+ list_for_each_entry(hook, &flowtable_hook->list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable_hook->num;
+ ops->priority = flowtable_hook->priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ }
}
return err;
}
+/* call under rcu_read_lock */
static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
{
const struct nf_flowtable_type *type;
- list_for_each_entry(type, &nf_tables_flowtables, list) {
+ list_for_each_entry_rcu(type, &nf_tables_flowtables, list) {
if (family == type->family)
return type;
}
@@ -5500,48 +8951,244 @@ nft_flowtable_type_get(struct net *net, u8 family)
{
const struct nf_flowtable_type *type;
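+	/* The flowtable type list is walked under RCU now; hold the
+	 * read lock across lookup and module refcount acquisition.
+	 */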
+ rcu_read_lock();
type = __nft_flowtable_type_get(family);
- if (type != NULL && try_module_get(type->owner))
+ if (type != NULL && try_module_get(type->owner)) {
+ rcu_read_unlock();
return type;
+ }
+ rcu_read_unlock();
lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nft_request_module(net, "nf-flowtable-%u", family);
- if (__nft_flowtable_type_get(family))
+ if (nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN)
return ERR_PTR(-EAGAIN);
}
#endif
return ERR_PTR(-ENOENT);
}
+/* Only called from error and netdev event paths. */
+static void nft_unregister_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
+{
+ nf_unregister_net_hook(net, ops);
+ flowtable->data.type->setup(&flowtable->data, ops->dev,
+ FLOW_BLOCK_UNBIND);
+}
+
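+/* Unregister every nf_hook_ops on @hook_list; if @release_netdev is
+ * true, the nft_hook entries themselves are unlinked and freed too.
+ */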
+static void __nft_unregister_flowtable_net_hooks(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct list_head *hook_list,
+ bool release_netdev)
+{
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ if (release_netdev) {
+ list_del(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ }
+}
+
static void nft_unregister_flowtable_net_hooks(struct net *net,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct list_head *hook_list)
{
- int i;
+ __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
+}
+
+static int nft_register_flowtable_ops(struct net *net,
+ struct nft_flowtable *flowtable,
+ struct nf_hook_ops *ops)
+{
+ int err;
+
+ err = flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_BIND);
+ if (err < 0)
+ return err;
+
+ err = nf_register_net_hook(net, ops);
+ if (!err)
+ return 0;
+
+ flowtable->data.type->setup(&flowtable->data,
+ ops->dev, FLOW_BLOCK_UNBIND);
+ return err;
+}
- for (i = 0; i < flowtable->ops_len; i++) {
- if (!flowtable->ops[i].dev)
+static int nft_register_flowtable_net_hooks(struct net *net,
+ struct nft_table *table,
+ struct list_head *hook_list,
+ struct nft_flowtable *flowtable)
+{
+ struct nft_hook *hook, *next;
+ struct nft_flowtable *ft;
+ struct nf_hook_ops *ops;
+ int err, i = 0;
+
+ list_for_each_entry(hook, hook_list, list) {
+ list_for_each_entry(ft, &table->flowtables, list) {
+ if (!nft_is_active_next(net, ft))
+ continue;
+
+ if (nft_hook_list_find(&ft->hook_list, hook)) {
+ err = -EEXIST;
+ goto err_unregister_net_hooks;
+ }
+ }
+
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ err = nft_register_flowtable_ops(net, flowtable, ops);
+ if (err < 0)
+ goto err_unregister_net_hooks;
+
+ i++;
+ }
+ }
+
+ return 0;
+
+err_unregister_net_hooks:
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
+
+ nft_unregister_flowtable_ops(net, flowtable, ops);
+ }
+ list_del_rcu(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+
+ return err;
+}
+
+static void nft_hooks_destroy(struct list_head *hook_list)
+{
+ struct nft_hook *hook, *next;
+
+ list_for_each_entry_safe(hook, next, hook_list, list) {
+ list_del_rcu(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+}
+
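+/* Update an existing flowtable: attach any newly listed devices and
+ * validate the flags, staged through a NFT_MSG_NEWFLOWTABLE update
+ * transaction.
+ */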
+static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_flowtable_hook flowtable_hook;
+ struct nftables_pernet *nft_net;
+ struct nft_hook *hook, *next;
+ struct nf_hook_ops *ops;
+ struct nft_trans *trans;
+ bool unregister = false;
+ u32 flags;
+ int err;
+
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
+ if (nft_hook_list_find(&flowtable->hook_list, hook)) {
+ list_del(&hook->list);
+ nft_netdev_hook_free(hook);
continue;
+ }
+
+ nft_net = nft_pernet(ctx->net);
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->msg_type != NFT_MSG_NEWFLOWTABLE ||
+ trans->table != ctx->table ||
+ !nft_trans_flowtable_update(trans))
+ continue;
+
+ if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) {
+ err = -EEXIST;
+ goto err_flowtable_update_hook;
+ }
+ }
+ }
- nf_unregister_net_hook(net, &flowtable->ops[i]);
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+ (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
+ err = -EOPNOTSUPP;
+ goto err_flowtable_update_hook;
+ }
+ } else {
+ flags = flowtable->data.flags;
+ }
+
+ err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
+ &flowtable_hook.list, flowtable);
+ if (err < 0)
+ goto err_flowtable_update_hook;
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE,
+ sizeof(struct nft_trans_flowtable));
+ if (!trans) {
+ unregister = true;
+ err = -ENOMEM;
+ goto err_flowtable_update_hook;
+ }
+
+ nft_trans_flowtable_flags(trans) = flags;
+ nft_trans_flowtable(trans) = flowtable;
+ nft_trans_flowtable_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans));
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_flowtable_update_hook:
+ list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
+ if (unregister) {
+ list_for_each_entry(ops, &hook->ops_list, list)
+ nft_unregister_flowtable_ops(ctx->net,
+ flowtable, ops);
+ }
+ list_del_rcu(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
}
+
+ return err;
}
-static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_newflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct netlink_ext_ack *extack = info->extack;
+ struct nft_flowtable_hook flowtable_hook;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
const struct nf_flowtable_type *type;
- struct nft_flowtable *flowtable, *ft;
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nft_flowtable *flowtable;
+ struct net *net = info->net;
struct nft_table *table;
+ struct nft_trans *trans;
struct nft_ctx ctx;
- int err, i, k;
+ int err;
if (!nla[NFTA_FLOWTABLE_TABLE] ||
!nla[NFTA_FLOWTABLE_NAME] ||
@@ -5549,13 +9196,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
}
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
if (IS_ERR(flowtable)) {
err = PTR_ERR(flowtable);
@@ -5564,24 +9211,32 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
return err;
}
} else {
- if (nlh->nlmsg_flags & NLM_F_EXCL) {
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return -EEXIST;
}
- return 0;
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ return nft_flowtable_update(&ctx, info->nlh, flowtable, extack);
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
- flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
- if (!flowtable)
- return -ENOMEM;
+ if (!nft_use_inc(&table->use))
+ return -EMFILE;
+
+ flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
+ if (!flowtable) {
+ err = -ENOMEM;
+ goto flowtable_alloc;
+ }
flowtable->table = table;
flowtable->handle = nf_tables_alloc_handle(table);
+ INIT_LIST_HEAD(&flowtable->hook_list);
- flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+ flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
if (!flowtable->name) {
err = -ENOMEM;
goto err1;
@@ -5593,54 +9248,52 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
goto err2;
}
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flowtable->data.flags =
+ ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+ err = -EOPNOTSUPP;
+ goto err3;
+ }
+ }
+
+ write_pnet(&flowtable->data.net, net);
flowtable->data.type = type;
err = type->init(&flowtable->data);
if (err < 0)
goto err3;
- err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
- flowtable);
+ err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable,
+ extack, true);
if (err < 0)
- goto err4;
-
- for (i = 0; i < flowtable->ops_len; i++) {
- if (!flowtable->ops[i].dev)
- continue;
+ goto err_flowtable_parse_hooks;
- list_for_each_entry(ft, &table->flowtables, list) {
- for (k = 0; k < ft->ops_len; k++) {
- if (!ft->ops[k].dev)
- continue;
-
- if (flowtable->ops[i].dev == ft->ops[k].dev &&
- flowtable->ops[i].pf == ft->ops[k].pf) {
- err = -EBUSY;
- goto err5;
- }
- }
- }
+ list_splice(&flowtable_hook.list, &flowtable->hook_list);
+ flowtable->data.priority = flowtable_hook.priority;
+ flowtable->hooknum = flowtable_hook.num;
- err = nf_register_net_hook(net, &flowtable->ops[i]);
- if (err < 0)
- goto err5;
+ trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto err_flowtable_trans;
}
- err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
+ /* This must be LAST to ensure no packets are walking over this flowtable. */
+ err = nft_register_flowtable_net_hooks(ctx.net, table,
+ &flowtable->hook_list,
+ flowtable);
if (err < 0)
- goto err6;
+ goto err_flowtable_hooks;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
- table->use++;
return 0;
-err6:
- i = flowtable->ops_len;
-err5:
- for (k = i - 1; k >= 0; k--)
- nf_unregister_net_hook(net, &flowtable->ops[k]);
- kfree(flowtable->ops);
-err4:
+err_flowtable_hooks:
+ nft_trans_destroy(trans);
+err_flowtable_trans:
+ nft_hooks_destroy(&flowtable->hook_list);
+err_flowtable_parse_hooks:
flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
@@ -5648,19 +9301,80 @@ err2:
kfree(flowtable->name);
err1:
kfree(flowtable);
+flowtable_alloc:
+ nft_use_dec_restore(&table->use);
+
return err;
}
-static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook)
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_next(net);
- int family = nfmsg->nfgen_family;
+ struct nft_hook *this, *next;
+
+ list_for_each_entry_safe(this, next, &flowtable_hook->list, list) {
+ list_del(&this->list);
+ nft_netdev_hook_free(this);
+ }
+}
+
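+/* Detach only the given devices from an existing flowtable, staged as
+ * a NFT_MSG_DELFLOWTABLE update transaction. Illustrative userspace
+ * usage (assuming a recent nft binary):
+ *
+ *	nft delete flowtable inet filter f '{ devices = { eth0 } ; }'
+ */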
+static int nft_delflowtable_hook(struct nft_ctx *ctx,
+ struct nft_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
+{
+ const struct nlattr * const *nla = ctx->nla;
+ struct nft_flowtable_hook flowtable_hook;
+ LIST_HEAD(flowtable_del_list);
+ struct nft_hook *this, *hook;
+ struct nft_trans *trans;
+ int err;
+
+ err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
+ extack, false);
+ if (err < 0)
+ return err;
+
+ list_for_each_entry(this, &flowtable_hook.list, list) {
+ hook = nft_hook_list_find(&flowtable->hook_list, this);
+ if (!hook) {
+ err = -ENOENT;
+ goto err_flowtable_del_hook;
+ }
+ list_move(&hook->list, &flowtable_del_list);
+ }
+
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
+ sizeof(struct nft_trans_flowtable));
+ if (!trans) {
+ err = -ENOMEM;
+ goto err_flowtable_del_hook;
+ }
+
+ nft_trans_flowtable(trans) = flowtable;
+ nft_trans_flowtable_update(trans) = true;
+ INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
+ list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
+ nft_flowtable_hook_release(&flowtable_hook);
+
+ nft_trans_commit_list_add_tail(ctx->net, trans);
+
+ return 0;
+
+err_flowtable_del_hook:
+ list_splice(&flowtable_del_list, &flowtable->hook_list);
+ nft_flowtable_hook_release(&flowtable_hook);
+
+ return err;
+}
+
+static int nf_tables_delflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_next(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct net *net = info->net;
const struct nlattr *attr;
struct nft_table *table;
struct nft_ctx ctx;
@@ -5671,7 +9385,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
+ genmask, NETLINK_CB(skb).portid);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
@@ -5682,66 +9396,81 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk,
flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
} else {
attr = nla[NFTA_FLOWTABLE_NAME];
- flowtable = nft_flowtable_lookup(table, attr, genmask);
+ flowtable = nft_flowtable_lookup(net, table, attr, genmask);
}
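+	/* As with objects, NFT_MSG_DESTROYFLOWTABLE tolerates a missing
+	 * flowtable.
+	 */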
if (IS_ERR(flowtable)) {
+ if (PTR_ERR(flowtable) == -ENOENT &&
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
+ return 0;
+
NL_SET_BAD_ATTR(extack, attr);
return PTR_ERR(flowtable);
}
+
+ nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+ if (nla[NFTA_FLOWTABLE_HOOK])
+ return nft_delflowtable_hook(&ctx, flowtable, extack);
+
if (flowtable->use > 0) {
NL_SET_BAD_ATTR(extack, attr);
return -EBUSY;
}
- nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
-
return nft_delflowtable(&ctx, flowtable);
}
static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq, int event,
u32 flags, int family,
- struct nft_flowtable *flowtable)
+ struct nft_flowtable *flowtable,
+ struct list_head *hook_list)
{
struct nlattr *nest, *nest_devs;
- struct nfgenmsg *nfmsg;
+ struct nft_hook *hook;
struct nlmsghdr *nlh;
- int i;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
NFTA_FLOWTABLE_PAD))
goto nla_put_failure;
- nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
+ if (!hook_list &&
+ (event == NFT_MSG_DELFLOWTABLE ||
+ event == NFT_MSG_DESTROYFLOWTABLE)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
+ nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
+ goto nla_put_failure;
+
+ nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK);
if (!nest)
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
+ nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority)))
goto nla_put_failure;
- nest_devs = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK_DEVS);
+ nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS);
if (!nest_devs)
goto nla_put_failure;
- for (i = 0; i < flowtable->ops_len; i++) {
- const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
+ if (!hook_list)
+ hook_list = &flowtable->hook_list;
- if (dev &&
- nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
+ list_for_each_entry_rcu(hook, hook_list, list,
+ lockdep_commit_lock_is_held(net)) {
+ if (nft_nla_put_hook_dev(skb, hook))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -5768,12 +9497,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
+ struct nftables_pernet *nft_net;
const struct nft_table *table;
rcu_read_lock();
- cb->seq = net->nft.base_seq;
+ nft_net = nft_pernet(net);
+ cb->seq = nft_base_seq(net);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
continue;
@@ -5793,7 +9524,8 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE,
NLM_F_MULTI | NLM_F_APPEND,
- table->family, flowtable) < 0)
+ table->family,
+ flowtable, NULL) < 0)
goto done;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -5844,21 +9576,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
}
/* called with rcu_read_lock held */
-static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getflowtable(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
- const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u8 genmask = nft_genmask_cur(net);
- int family = nfmsg->nfgen_family;
+ struct netlink_ext_ack *extack = info->extack;
+ u8 genmask = nft_genmask_cur(info->net);
+ u8 family = info->nfmsg->nfgen_family;
struct nft_flowtable *flowtable;
const struct nft_table *table;
+ struct net *net = info->net;
struct sk_buff *skb2;
int err;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.start = nf_tables_dump_flowtable_start,
.dump = nf_tables_dump_flowtable,
@@ -5867,47 +9598,54 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
.data = (void *)nla,
};
- return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
+ return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}
if (!nla[NFTA_FLOWTABLE_NAME])
return -EINVAL;
table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
- genmask);
- if (IS_ERR(table))
+ genmask, 0);
+ if (IS_ERR(table)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
return PTR_ERR(table);
+ }
- flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
+ flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
genmask);
- if (IS_ERR(flowtable))
+ if (IS_ERR(flowtable)) {
+ NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
return PTR_ERR(flowtable);
+ }
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb2)
return -ENOMEM;
err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
+ info->nlh->nlmsg_seq,
NFT_MSG_NEWFLOWTABLE, 0, family,
- flowtable);
+ flowtable, NULL);
if (err < 0)
- goto err;
+ goto err_fill_flowtable_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+
+err_fill_flowtable_info:
kfree_skb(skb2);
return err;
}
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
struct nft_flowtable *flowtable,
- int event)
+ struct list_head *hook_list, int event)
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct sk_buff *skb;
+ u16 flags = 0;
int err;
- if (ctx->report &&
+ if (!ctx->report &&
!nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
return;
@@ -5915,16 +9653,18 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
if (skb == NULL)
goto err;
+ if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
+ flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
+
err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid,
- ctx->seq, event, 0,
- ctx->family, flowtable);
+ ctx->seq, event, flags,
+ ctx->family, flowtable, hook_list);
if (err < 0) {
kfree_skb(skb);
goto err;
}
- nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
- ctx->report, GFP_KERNEL);
+ nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
return;
err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
@@ -5932,9 +9672,14 @@ err:
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
- kfree(flowtable->ops);
- kfree(flowtable->name);
+ struct nft_hook *hook, *next;
+
flowtable->data.type->free(&flowtable->data);
+ list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ list_del_rcu(&hook->list);
+ nft_netdev_hook_free_rcu(hook);
+ }
+ kfree(flowtable->name);
module_put(flowtable->data.type->owner);
kfree(flowtable);
}
@@ -5943,20 +9688,15 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
+ NFNETLINK_V0, nft_base_seq_be16(net));
+ if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
-
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -5969,42 +9709,132 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void nft_flowtable_event(unsigned long event, struct net_device *dev,
- struct nft_flowtable *flowtable)
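+/* Find the nf_hook_ops bound to @dev in @hook's ops list; plain list
+ * walk, see the _rcu variant below for lockless readers.
+ */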
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+ const struct net_device *dev)
{
- int i;
+ struct nf_hook_ops *ops;
- for (i = 0; i < flowtable->ops_len; i++) {
- if (flowtable->ops[i].dev != dev)
- continue;
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops);
- nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
- flowtable->ops[i].dev = NULL;
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+ const struct net_device *dev)
+{
+ struct nf_hook_ops *ops;
+
+ list_for_each_entry_rcu(ops, &hook->ops_list, list) {
+ if (ops->dev == dev)
+ return ops;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
+
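+/* Per-flowtable netdevice event handler: tear down hook ops when a
+ * matching device goes away and (re)create them on register or rename,
+ * matching devices by interface name.
+ */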
+static int nft_flowtable_event(unsigned long event, struct net_device *dev,
+ struct nft_flowtable *flowtable, bool changename)
+{
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ bool match;
+
+ list_for_each_entry(hook, &flowtable->hook_list, list) {
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ /* flow_offload_netdev_event() cleans up entries for us. */
+ nft_unregister_flowtable_ops(dev_net(dev),
+ flowtable, ops);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kzalloc(sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = flowtable->hooknum;
+ ops->priority = flowtable->data.priority;
+ ops->priv = &flowtable->data;
+ ops->hook = flowtable->data.type->hook;
+ ops->hook_ops_type = NF_HOOK_OP_NFT_FT;
+ ops->dev = dev;
+ if (nft_register_flowtable_ops(dev_net(dev),
+ flowtable, ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
break;
}
+ return 0;
+}
+
+static int __nf_tables_flowtable_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
+{
+ struct nftables_pernet *nft_net = nft_pernet(dev_net(dev));
+ struct nft_flowtable *flowtable;
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ list_for_each_entry(flowtable, &table->flowtables, list) {
+ if (nft_flowtable_event(event, dev,
+ flowtable, changename))
+ return 1;
+ }
+ }
+ return 0;
}
static int nf_tables_flowtable_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct nft_flowtable *flowtable;
- struct nft_table *table;
+ struct nftables_pernet *nft_net;
+ int ret = NOTIFY_DONE;
struct net *net;
- if (event != NETDEV_UNREGISTER)
- return 0;
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
net = dev_net(dev);
- mutex_lock(&net->nft.commit_mutex);
- list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- nft_flowtable_event(event, dev, flowtable);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
}
+ __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_flowtable_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
}
- mutex_unlock(&net->nft.commit_mutex);
-
- return NOTIFY_DONE;
+out_unlock:
+ mutex_unlock(&nft_net->commit_mutex);
+ return ret;
}
static struct notifier_block nf_tables_flowtable_notifier = {
@@ -6018,7 +9848,7 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
struct sk_buff *skb2;
int err;
- if (nlmsg_report(nlh) &&
+ if (!nlmsg_report(nlh) &&
!nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
return;
@@ -6041,10 +9871,8 @@ err:
-ENOBUFS);
}
-static int nf_tables_getgen(struct net *net, struct sock *nlsk,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- struct netlink_ext_ack *extack)
+static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nla[])
{
struct sk_buff *skb2;
int err;
@@ -6053,128 +9881,206 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk,
if (skb2 == NULL)
return -ENOMEM;
- err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq);
+ err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq);
if (err < 0)
- goto err;
+ goto err_fill_gen_info;
- return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
-err:
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+
+err_fill_gen_info:
kfree_skb(skb2);
return err;
}
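+/* All mutating commands take the batch path (NFNL_CB_BATCH) and go
+ * through the transaction machinery; read-only GET/dump handlers run
+ * under RCU (NFNL_CB_RCU).
+ */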
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_NEWTABLE] = {
- .call_batch = nf_tables_newtable,
+ .call = nf_tables_newtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
- .call_rcu = nf_tables_gettable,
+ .call = nf_tables_gettable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_DELTABLE] = {
- .call_batch = nf_tables_deltable,
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_TABLE_MAX,
+ .policy = nft_table_policy,
+ },
+ [NFT_MSG_DESTROYTABLE] = {
+ .call = nf_tables_deltable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_NEWCHAIN] = {
- .call_batch = nf_tables_newchain,
+ .call = nf_tables_newchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
- .call_rcu = nf_tables_getchain,
+ .call = nf_tables_getchain,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_DELCHAIN] = {
- .call_batch = nf_tables_delchain,
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_CHAIN_MAX,
+ .policy = nft_chain_policy,
+ },
+ [NFT_MSG_DESTROYCHAIN] = {
+ .call = nf_tables_delchain,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_NEWRULE] = {
- .call_batch = nf_tables_newrule,
+ .call = nf_tables_newrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
- .call_rcu = nf_tables_getrule,
+ .call = nf_tables_getrule,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_GETRULE_RESET] = {
+ .call = nf_tables_getrule_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_DELRULE] = {
- .call_batch = nf_tables_delrule,
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_RULE_MAX,
+ .policy = nft_rule_policy,
+ },
+ [NFT_MSG_DESTROYRULE] = {
+ .call = nf_tables_delrule,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_NEWSET] = {
- .call_batch = nf_tables_newset,
+ .call = nf_tables_newset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
- .call_rcu = nf_tables_getset,
+ .call = nf_tables_getset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_DELSET] = {
- .call_batch = nf_tables_delset,
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_MAX,
+ .policy = nft_set_policy,
+ },
+ [NFT_MSG_DESTROYSET] = {
+ .call = nf_tables_delset,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_NEWSETELEM] = {
- .call_batch = nf_tables_newsetelem,
+ .call = nf_tables_newsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
- .call_rcu = nf_tables_getsetelem,
+ .call = nf_tables_getsetelem,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_GETSETELEM_RESET] = {
+ .call = nf_tables_getsetelem_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_DELSETELEM] = {
- .call_batch = nf_tables_delsetelem,
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_SET_ELEM_LIST_MAX,
+ .policy = nft_set_elem_list_policy,
+ },
+ [NFT_MSG_DESTROYSETELEM] = {
+ .call = nf_tables_delsetelem,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
- .call_rcu = nf_tables_getgen,
+ .call = nf_tables_getgen,
+ .type = NFNL_CB_RCU,
},
[NFT_MSG_NEWOBJ] = {
- .call_batch = nf_tables_newobj,
+ .call = nf_tables_newobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_DELOBJ] = {
- .call_batch = nf_tables_delobj,
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_OBJ_MAX,
+ .policy = nft_obj_policy,
+ },
+ [NFT_MSG_DESTROYOBJ] = {
+ .call = nf_tables_delobj,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
- .call_rcu = nf_tables_getobj,
+ .call = nf_tables_getobj_reset,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_NEWFLOWTABLE] = {
- .call_batch = nf_tables_newflowtable,
+ .call = nf_tables_newflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
- .call_rcu = nf_tables_getflowtable,
+ .call = nf_tables_getflowtable,
+ .type = NFNL_CB_RCU,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_DELFLOWTABLE] = {
- .call_batch = nf_tables_delflowtable,
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
+ .attr_count = NFTA_FLOWTABLE_MAX,
+ .policy = nft_flowtable_policy,
+ },
+ [NFT_MSG_DESTROYFLOWTABLE] = {
+ .call = nf_tables_delflowtable,
+ .type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
@@ -6182,99 +10088,159 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
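+/* Validation state is now tracked per table instead of per netns, so
+ * only tables flagged NFT_VALIDATE_NEED are re-validated.
+ */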
static int nf_tables_validate(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_table *table;
- switch (net->nft.validate_state) {
- case NFT_VALIDATE_SKIP:
- break;
- case NFT_VALIDATE_NEED:
- nft_validate_state_update(net, NFT_VALIDATE_DO);
- /* fall through */
- case NFT_VALIDATE_DO:
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
+ switch (table->validate_state) {
+ case NFT_VALIDATE_SKIP:
+ continue;
+ case NFT_VALIDATE_NEED:
+ nft_validate_state_update(table, NFT_VALIDATE_DO);
+ fallthrough;
+ case NFT_VALIDATE_DO:
if (nft_table_validate(net, table) < 0)
return -EAGAIN;
+
+ nft_validate_state_update(table, NFT_VALIDATE_SKIP);
+ break;
}
- break;
}
return 0;
}
-static void nft_chain_commit_update(struct nft_trans *trans)
+/* a drop policy has to be deferred until all rules have been activated,
+ * otherwise a large ruleset that contains a drop-policy base chain will
+ * cause all packets to get dropped until the full transaction has been
+ * processed.
+ *
+ * We defer the drop policy until the transaction has been finalized.
+ */
+static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans)
+{
+ struct nft_base_chain *basechain;
+
+ if (trans->policy != NF_DROP)
+ return;
+
+ if (!nft_is_base_chain(trans->chain))
+ return;
+
+ basechain = nft_base_chain(trans->chain);
+ basechain->policy = NF_DROP;
+}
+
+static void nft_chain_commit_update(struct nft_trans_chain *trans)
{
+ struct nft_table *table = trans->nft_trans_binding.nft_trans.table;
struct nft_base_chain *basechain;
- if (nft_trans_chain_name(trans)) {
- rhltable_remove(&trans->ctx.table->chains_ht,
- &trans->ctx.chain->rhlhead,
+ if (trans->name) {
+ rhltable_remove(&table->chains_ht,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
- swap(trans->ctx.chain->name, nft_trans_chain_name(trans));
- rhltable_insert_key(&trans->ctx.table->chains_ht,
- trans->ctx.chain->name,
- &trans->ctx.chain->rhlhead,
+ swap(trans->chain->name, trans->name);
+ rhltable_insert_key(&table->chains_ht,
+ trans->chain->name,
+ &trans->chain->rhlhead,
nft_chain_ht_params);
}
- if (!nft_is_base_chain(trans->ctx.chain))
+ if (!nft_is_base_chain(trans->chain))
return;
- basechain = nft_base_chain(trans->ctx.chain);
- nft_chain_stats_replace(trans->ctx.net, basechain,
- nft_trans_chain_stats(trans));
+ nft_chain_stats_replace(trans);
+
+ basechain = nft_base_chain(trans->chain);
- switch (nft_trans_chain_policy(trans)) {
+ switch (trans->policy) {
case NF_DROP:
case NF_ACCEPT:
- basechain->policy = nft_trans_chain_policy(trans);
+ basechain->policy = trans->policy;
break;
}
}
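+/* Apply a staged object update: copy the new state into the live
+ * object via ->update() and free the temporary copy.
+ */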
+static void nft_obj_commit_update(const struct nft_ctx *ctx,
+ struct nft_trans *trans)
+{
+ struct nft_object *newobj;
+ struct nft_object *obj;
+
+ obj = nft_trans_obj(trans);
+ newobj = nft_trans_obj_newobj(trans);
+
+ if (WARN_ON_ONCE(!obj->ops->update))
+ return;
+
+ obj->ops->update(obj, newobj);
+ nft_obj_destroy(ctx, newobj);
+}
+
static void nft_commit_release(struct nft_trans *trans)
{
+ struct nft_ctx ctx = {
+ .net = trans->net,
+ };
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_DELTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYTABLE:
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
+ free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
break;
case NFT_MSG_DELCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans))
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_DELRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ case NFT_MSG_DESTROYRULE:
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nf_tables_set_elem_destroy(&trans->ctx,
- nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv);
+ case NFT_MSG_DESTROYSETELEM:
+ nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans));
break;
case NFT_MSG_DELOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_DELFLOWTABLE:
- nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+ case NFT_MSG_DESTROYFLOWTABLE:
+ if (nft_trans_flowtable_update(trans))
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ else
+ nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
if (trans->put_net)
- put_net(trans->ctx.net);
+ put_net(trans->net);
kfree(trans);
}
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
+ struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
struct nft_trans *trans, *next;
LIST_HEAD(head);
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_init(&nf_tables_destroy_list, &head);
+ list_splice_init(&nft_net->destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);
if (list_empty(&head))
@@ -6283,93 +10249,149 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
synchronize_rcu();
list_for_each_entry_safe(trans, next, &head, list) {
- list_del(&trans->list);
+ nft_trans_list_del(trans);
nft_commit_release(trans);
}
}
+void nf_tables_trans_destroy_flush_work(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ flush_work(&nft_net->destroy_work);
+}
+EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
+
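+/* Register-tracking based expression reduction is not wired up here;
+ * always keep the expression as-is.
+ */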
+static bool nft_expr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ return false;
+}
+
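+/* Translate the chain's rule list into the compact blob layout used by
+ * the data path: a struct nft_rule_dp header per rule, followed by its
+ * expressions, terminated by a trailer rule.
+ */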
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
+ const struct nft_expr *expr, *last;
+ struct nft_regs_track track = {};
+ unsigned int size, data_size;
+ void *data, *data_boundary;
+ struct nft_rule_dp *prule;
struct nft_rule *rule;
- unsigned int alloc = 0;
- int i;
/* already handled or inactive chain? */
- if (chain->rules_next || !nft_is_active_next(net, chain))
+ if (chain->blob_next || !nft_is_active_next(net, chain))
return 0;
- rule = list_entry(&chain->rules, struct nft_rule, list);
- i = 0;
-
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- alloc++;
+ data_size = 0;
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (nft_is_active_next(net, rule)) {
+ data_size += sizeof(*prule) + rule->dlen;
+ if (data_size > INT_MAX)
+ return -ENOMEM;
+ }
}
- chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
- if (!chain->rules_next)
+ chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
+ if (!chain->blob_next)
return -ENOMEM;
- list_for_each_entry_continue(rule, &chain->rules, list) {
- if (nft_is_active_next(net, rule))
- chain->rules_next[i++] = rule;
+ data = (void *)chain->blob_next->data;
+ data_boundary = data + data_size;
+ size = 0;
+
+ list_for_each_entry(rule, &chain->rules, list) {
+ if (!nft_is_active_next(net, rule))
+ continue;
+
+ prule = (struct nft_rule_dp *)data;
+ data += offsetof(struct nft_rule_dp, data);
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ size = 0;
+ track.last = nft_expr_last(rule);
+ nft_rule_for_each_expr(expr, last, rule) {
+ track.cur = expr;
+
+ if (nft_expr_reduce(&track, expr)) {
+ expr = track.cur;
+ continue;
+ }
+
+ if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
+ return -ENOMEM;
+
+ memcpy(data + size, expr, expr->ops->size);
+ size += expr->ops->size;
+ }
+ if (WARN_ON_ONCE(size >= 1 << 12))
+ return -ENOMEM;
+
+ prule->handle = rule->handle;
+ prule->dlen = size;
+ prule->is_last = 0;
+
+ data += size;
+ size = 0;
+ chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- chain->rules_next[i] = NULL;
+ if (WARN_ON_ONCE(data > data_boundary))
+ return -ENOMEM;
+
+ prule = (struct nft_rule_dp *)data;
+ nft_last_rule(chain, prule);
+
return 0;
}
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
- struct nft_chain *chain = trans->ctx.chain;
-
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- kvfree(chain->rules_next);
- chain->rules_next = NULL;
+ struct nft_chain *chain = nft_trans_rule_chain(trans);
+
+ kvfree(chain->blob_next);
+ chain->blob_next = NULL;
}
}
}
-static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
+static void __nf_tables_commit_chain_free_rules(struct rcu_head *h)
{
- struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
+ struct nft_rule_dp_last *l = container_of(h, struct nft_rule_dp_last, h);
- kvfree(o->start);
+ kvfree(l->blob);
}
-static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
+static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
- struct nft_rule **r = rules;
- struct nft_rules_old *old;
+ struct nft_rule_dp_last *last;
- while (*r)
- r++;
+ /* last rule trailer is after end marker */
+ last = (void *)blob + sizeof(*blob) + blob->size;
+ last->blob = blob;
- r++; /* rcu_head is after end marker */
- old = (void *) r;
- old->start = rules;
-
- call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
+ call_rcu(&last->h, __nf_tables_commit_chain_free_rules);
}
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
- struct nft_rule **g0, **g1;
+ struct nft_rule_blob *g0, *g1;
bool next_genbit;
next_genbit = nft_gencursor_next(net);
- g0 = rcu_dereference_protected(chain->rules_gen_0,
+ g0 = rcu_dereference_protected(chain->blob_gen_0,
lockdep_commit_lock_is_held(net));
- g1 = rcu_dereference_protected(chain->rules_gen_1,
+ g1 = rcu_dereference_protected(chain->blob_gen_1,
lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
- if (chain->rules_next == NULL) {
+ if (chain->blob_next == NULL) {
/* chain had no change in last or next generation */
if (g0 == g1)
return;
@@ -6378,10 +10400,10 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
* one uses same rules as current generation.
*/
if (next_genbit) {
- rcu_assign_pointer(chain->rules_gen_1, g0);
+ rcu_assign_pointer(chain->blob_gen_1, g0);
nf_tables_commit_chain_free_rules_old(g1);
} else {
- rcu_assign_pointer(chain->rules_gen_0, g1);
+ rcu_assign_pointer(chain->blob_gen_0, g1);
nf_tables_commit_chain_free_rules_old(g0);
}
@@ -6389,11 +10411,11 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
}
if (next_genbit)
- rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
else
- rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
+ rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);
- chain->rules_next = NULL;
+ chain->blob_next = NULL;
if (g0 == g1)
return;
@@ -6404,7 +10426,13 @@ static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
nf_tables_commit_chain_free_rules_old(g0);
}
-static void nft_chain_del(struct nft_chain *chain)
+static void nft_obj_del(struct nft_object *obj)
+{
+ rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params);
+ list_del_rcu(&obj->list);
+}
+
+void nft_chain_del(struct nft_chain *chain)
{
struct nft_table *table = chain->table;
@@ -6413,8 +10441,259 @@ static void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
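+/* Asynchronous set element GC: expired elements are batched into
+ * nft_trans_gc records and released after an RCU grace period; gc_seq
+ * detects races with the transaction machinery. A set backend might
+ * use it like this (illustrative sketch, not part of this patch):
+ *
+ *	gc = nft_trans_gc_alloc(set, gc_seq, GFP_ATOMIC);
+ *	...
+ *	gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ *	if (!gc)
+ *		goto try_later;
+ *	nft_trans_gc_elem_add(gc, elem_priv);
+ *	...
+ *	nft_trans_gc_queue_async_done(gc);
+ */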
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+ struct nft_trans_gc *trans)
+{
+ struct nft_elem_priv **priv = trans->priv;
+ unsigned int i;
+
+ for (i = 0; i < trans->count; i++) {
+ nft_setelem_data_deactivate(ctx->net, trans->set, priv[i]);
+ nft_setelem_remove(ctx->net, trans->set, priv[i]);
+ }
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+ nft_set_put(trans->set);
+ put_net(trans->net);
+ kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+ struct nft_elem_priv *elem_priv;
+ struct nft_trans_gc *trans;
+ struct nft_ctx ctx = {};
+ unsigned int i;
+
+ trans = container_of(rcu, struct nft_trans_gc, rcu);
+ ctx.net = read_pnet(&trans->set->net);
+
+ for (i = 0; i < trans->count; i++) {
+ elem_priv = trans->priv[i];
+ if (!nft_setelem_is_catchall(trans->set, elem_priv))
+ atomic_dec(&trans->set->nelems);
+
+ nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
+ }
+
+ nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+ struct nftables_pernet *nft_net;
+ struct nft_ctx ctx = {};
+
+ nft_net = nft_pernet(trans->net);
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ /* Check for race with transaction, otherwise this batch refers to
+ * stale objects that might not be there anymore. Skip transaction if
+ * set has been destroyed from control plane transaction in case gc
+ * worker loses race.
+ */
+ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+ mutex_unlock(&nft_net->commit_mutex);
+ return false;
+ }
+
+ ctx.net = trans->net;
+ ctx.table = trans->set->table;
+
+ nft_trans_gc_setelem_remove(&ctx, trans);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+ struct nft_trans_gc *trans, *next;
+ LIST_HEAD(trans_gc_list);
+
+ spin_lock(&nf_tables_gc_list_lock);
+ list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+ list_del(&trans->list);
+ if (!nft_trans_gc_work_done(trans)) {
+ nft_trans_gc_destroy(trans);
+ continue;
+ }
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+ }
+}
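
nft_trans_gc_work() drains the global pending list, but a batch is only applied when nft_trans_gc_work_done() confirms that its recorded gc_seq still matches the per-netns counter and the set is not marked dead; otherwise the batch may name elements the control plane already released, so it is discarded. A hedged sketch of that staleness test (field names follow the diff; locking is elided):

/* Sketch: discard GC batches that raced with a transaction. */
#include <stdbool.h>
#include <stdio.h>

struct gc_batch {
	unsigned int seq;	/* gc_seq sampled at allocation */
	bool set_dead;		/* set destroyed since then? */
};

static unsigned int gc_seq;	/* bumped around every commit/abort */

static bool batch_still_valid(const struct gc_batch *b)
{
	/* any intervening transaction invalidates the batch */
	return b->seq == gc_seq && !b->set_dead;
}

int main(void)
{
	struct gc_batch b = { .seq = 4, .set_dead = false };

	gc_seq = 4;
	printf("apply batch: %d\n", batch_still_valid(&b));	/* 1 */
	gc_seq = 6;	/* a commit ran in between */
	printf("apply batch: %d\n", batch_still_valid(&b));	/* 0 */
	return 0;
}
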
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct net *net = read_pnet(&set->net);
+ struct nft_trans_gc *trans;
+
+ trans = kzalloc(sizeof(*trans), gfp);
+ if (!trans)
+ return NULL;
+
+ trans->net = maybe_get_net(net);
+ if (!trans->net) {
+ kfree(trans);
+ return NULL;
+ }
+
+ refcount_inc(&set->refs);
+ trans->set = set;
+ trans->seq = gc_seq;
+
+ return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+ trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+ spin_lock(&nf_tables_gc_list_lock);
+ list_add_tail(&trans->list, &nf_tables_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct nft_set *set;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set;
+ nft_trans_gc_queue_work(gc);
+
+ return nft_trans_gc_alloc(set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+ struct nft_set *set;
+
+ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+ return NULL;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ set = gc->set;
+ call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+ return nft_trans_gc_alloc(set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ const struct nft_set *set = gc->set;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext))
+ continue;
+ if (nft_set_elem_is_dead(ext))
+ goto dead_elem;
+
+ nft_set_elem_dead(ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ return NULL;
+
+ nft_trans_gc_elem_add(gc, catchall->elem);
+ }
+
+ return gc;
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
+{
+ struct nft_set_elem_catchall *catchall, *next;
+ u64 tstamp = nft_net_tstamp(gc->net);
+ const struct nft_set *set = gc->set;
+ struct nft_elem_priv *elem_priv;
+ struct nft_set_ext *ext;
+
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));
+
+ list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!__nft_set_elem_expired(ext, tstamp))
+ continue;
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return NULL;
+
+ elem_priv = catchall->elem;
+ nft_setelem_data_deactivate(gc->net, gc->set, elem_priv);
+ nft_setelem_catchall_destroy(catchall);
+ nft_trans_gc_elem_add(gc, elem_priv);
+ }
+
+ return gc;
+}
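
The sync catchall walk compares expiry against a timestamp sampled once per transaction (nft_net_tstamp, refreshed in nf_tables_valid_genid further below) rather than re-reading the clock per element, so every delete-by-expiry decision in one run agrees on what "now" means. A minimal sketch of that frozen-timestamp check, with values invented for the demo:

/* Sketch: expiry decided against a timestamp frozen at
 * transaction start.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct elem { uint64_t expires; };

static bool elem_expired(const struct elem *e, uint64_t tstamp)
{
	return e->expires <= tstamp;
}

int main(void)
{
	uint64_t tstamp = 1000;	/* sampled once, like nft_net_tstamp() */
	struct elem a = { .expires =  900 };
	struct elem b = { .expires = 1100 };

	printf("a expired: %d\n", elem_expired(&a, tstamp));	/* 1 */
	printf("b expired: %d\n", elem_expired(&b, tstamp));	/* 0 */
	return 0;
}
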
+
+static void nf_tables_module_autoload_cleanup(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_module_request *req, *next;
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ list_for_each_entry_safe(req, next, &nft_net->module_list, list) {
+ WARN_ON_ONCE(!req->done);
+ list_del(&req->list);
+ kfree(req);
+ }
+}
+
static void nf_tables_commit_release(struct net *net)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans;
/* all side effects have to be made visible.
@@ -6424,55 +10703,253 @@ static void nf_tables_commit_release(struct net *net)
* Memory reclaim happens asynchronously from work queue
* to prevent expensive synchronize_rcu() in commit phase.
*/
- if (list_empty(&net->nft.commit_list)) {
- mutex_unlock(&net->nft.commit_mutex);
+ if (list_empty(&nft_net->commit_list)) {
+ nf_tables_module_autoload_cleanup(net);
+ mutex_unlock(&nft_net->commit_mutex);
return;
}
- trans = list_last_entry(&net->nft.commit_list,
+ trans = list_last_entry(&nft_net->commit_list,
struct nft_trans, list);
- get_net(trans->ctx.net);
+ get_net(trans->net);
WARN_ON_ONCE(trans->put_net);
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
- mutex_unlock(&net->nft.commit_mutex);
+ nf_tables_module_autoload_cleanup(net);
+ schedule_work(&nft_net->destroy_work);
+
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
+static void nft_commit_notify(struct net *net, u32 portid)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct sk_buff *batch_skb = NULL, *nskb, *skb;
+ unsigned char *data;
+ int len;
+
+ list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) {
+ if (!batch_skb) {
+new_batch:
+ batch_skb = skb;
+ len = NLMSG_GOODSIZE - skb->len;
+ list_del(&skb->list);
+ continue;
+ }
+ len -= skb->len;
+ if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) {
+ data = skb_put(batch_skb, skb->len);
+ memcpy(data, skb->data, skb->len);
+ list_del(&skb->list);
+ kfree_skb(skb);
+ continue;
+ }
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ goto new_batch;
+ }
+
+ if (batch_skb) {
+ nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES,
+ NFT_CB(batch_skb).report, GFP_KERNEL);
+ }
+
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+}
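
nft_commit_notify() coalesces the queued notification skbs greedily: messages are appended to the current batch while the NLMSG_GOODSIZE budget holds and the report flag matches, otherwise the batch is sent and a new one started. A small standalone sketch of the byte-budget part (the arithmetic here is simplified relative to the kernel's remaining-length bookkeeping, and the sizes are made up):

/* Sketch: greedy coalescing of messages under a byte budget. */
#include <stdio.h>

#define GOODSIZE 32	/* stand-in for NLMSG_GOODSIZE */

int main(void)
{
	const int msg_len[] = { 10, 12, 14, 6, 30 };
	int used = 0, batch = 1;

	for (unsigned int i = 0; i < 5; i++) {
		if (used + msg_len[i] > GOODSIZE) {
			/* budget exhausted: send, start a new batch */
			printf("send batch %d (%d bytes)\n", batch++, used);
			used = 0;
		}
		used += msg_len[i];
	}
	if (used)	/* final flush, as after the loop in the diff */
		printf("send batch %d (%d bytes)\n", batch, used);
	return 0;
}
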
+
+static int nf_tables_commit_audit_alloc(struct list_head *adl,
+ struct nft_table *table)
+{
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ return 0;
+ }
+ adp = kzalloc(sizeof(*adp), GFP_KERNEL);
+ if (!adp)
+ return -ENOMEM;
+ adp->table = table;
+ list_add(&adp->list, adl);
+ return 0;
+}
+
+static void nf_tables_commit_audit_free(struct list_head *adl)
+{
+ struct nft_audit_data *adp, *adn;
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ list_del(&adp->list);
+ kfree(adp);
+ }
+}
+
+/* nft audit emits the number of elements that get added/removed/updated,
+ * so NEW/DELSETELEM needs to increment based on the total elem count.
+ */
+static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans)
+{
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSETELEM:
+ case NFT_MSG_DELSETELEM:
+ return nft_trans_container_elem(trans)->nelems;
+ }
+
+ return 1;
+}
+
+static void nf_tables_commit_audit_collect(struct list_head *adl,
+ const struct nft_trans *trans, u32 op)
+{
+ const struct nft_table *table = trans->table;
+ struct nft_audit_data *adp;
+
+ list_for_each_entry(adp, adl, list) {
+ if (adp->table == table)
+ goto found;
+ }
+ WARN_ONCE(1, "table=%s not expected in commit list", table->name);
+ return;
+found:
+ adp->entries += nf_tables_commit_audit_entrycount(trans);
+ if (!adp->op || adp->op > op)
+ adp->op = op;
+}
- schedule_work(&trans_destroy_work);
+#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22)
+
+static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
+{
+ struct nft_audit_data *adp, *adn;
+ char aubuf[AUNFTABLENAMELEN];
+
+ list_for_each_entry_safe(adp, adn, adl, list) {
+ snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name,
+ generation);
+ audit_log_nfcfg(aubuf, adp->table->family, adp->entries,
+ nft2audit_op[adp->op], GFP_KERNEL);
+ list_del(&adp->list);
+ kfree(adp);
+ }
+}
+
+static void nft_set_commit_update(struct list_head *set_update_list)
+{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update);
+
+ if (!set->ops->commit || set->dead)
+ continue;
+
+ set->ops->commit(set);
+ }
+}
+
+static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net)
+{
+ unsigned int gc_seq;
+
+ /* Bump gc counter, it becomes odd, this is the busy mark. */
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
+ return gc_seq;
+}
+
+static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq)
+{
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
}
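
nft_gc_seq_begin/end form an odd-busy sequence counter: the count is odd while a transaction is in flight and even when quiescent, and any change between sampling and completion tells the async GC worker it raced with the commit path. A compact sketch, minus the READ_ONCE/WRITE_ONCE annotations:

/* Sketch: odd = transaction in flight, even = quiescent. */
#include <stdio.h>

static unsigned int seq;

static unsigned int gc_seq_begin(void) { return ++seq; } /* becomes odd */
static void gc_seq_end(void) { ++seq; }                  /* even again */

int main(void)
{
	unsigned int sample = seq;		/* GC worker samples: 0 */
	unsigned int busy = gc_seq_begin();	/* commit starts: 1 */

	gc_seq_end();				/* commit done: 2 */
	printf("busy mark odd: %u\n", busy & 1);	/* 1 */
	printf("batch stale: %d\n", sample != seq);	/* 1: discard */
	return 0;
}
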
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ const struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ struct nft_trans_binding *trans_binding;
struct nft_trans *trans, *next;
+ unsigned int base_seq, gc_seq;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
+ struct nft_ctx ctx;
+ LIST_HEAD(adl);
+ int err;
+
+ if (list_empty(&nft_net->commit_list)) {
+ mutex_unlock(&nft_net->commit_mutex);
+ return 0;
+ }
+
+ nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL);
+
+ list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) {
+ trans = &trans_binding->nft_trans;
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWSET:
+ if (!nft_trans_set_update(trans) &&
+ nft_set_is_anonymous(nft_trans_set(trans)) &&
+ !nft_trans_set_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound set\n");
+ return -EINVAL;
+ }
+ break;
+ case NFT_MSG_NEWCHAIN:
+ if (!nft_trans_chain_update(trans) &&
+ nft_chain_binding(nft_trans_chain(trans)) &&
+ !nft_trans_chain_bound(trans)) {
+ pr_warn_once("nftables ruleset with unbound chain\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type);
+ break;
+ }
+ }
/* 0. Validate ruleset, otherwise roll back for error reporting. */
- if (nf_tables_validate(net) < 0)
+ if (nf_tables_validate(net) < 0) {
+ nft_net->validate_state = NFT_VALIDATE_DO;
return -EAGAIN;
+ }
+
+ err = nft_flow_rule_offload_commit(net);
+ if (err < 0)
+ return err;
/* 1. Allocate space for next generation rules_gen_X[] */
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ struct nft_table *table = trans->table;
int ret;
+ ret = nf_tables_commit_audit_alloc(&adl, table);
+ if (ret) {
+ nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
+ return ret;
+ }
if (trans->msg_type == NFT_MSG_NEWRULE ||
trans->msg_type == NFT_MSG_DELRULE) {
- chain = trans->ctx.chain;
+ chain = nft_trans_rule_chain(trans);
ret = nf_tables_commit_chain_prepare(net, chain);
if (ret < 0) {
nf_tables_commit_chain_prepare_cancel(net);
+ nf_tables_commit_audit_free(&adl);
return ret;
}
}
}
/* step 2. Make rules_gen_X visible to packet path */
- list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(table, &nft_net->tables, list) {
list_for_each_entry(chain, &table->chains, list)
nf_tables_commit_chain(net, chain);
}
@@ -6481,296 +10958,548 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- while (++net->nft.base_seq == 0);
+ base_seq = nft_base_seq(net);
+ while (++base_seq == 0)
+ ;
+
+ /* pairs with smp_load_acquire in nft_lookup_eval */
+ smp_store_release(&net->nft.base_seq, base_seq);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
+ nf_tables_commit_audit_collect(&adl, trans, trans->msg_type);
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
}
+ if (table->flags & NFT_TABLE_F_DORMANT)
+ nf_tables_table_disable(net, table);
+
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
} else {
- nft_clear(net, trans->ctx.table);
+ nft_clear(net, table);
}
- nf_tables_table_notify(&trans->ctx, NFT_MSG_NEWTABLE);
+ nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELTABLE:
- list_del_rcu(&trans->ctx.table->list);
- nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
+ case NFT_MSG_DESTROYTABLE:
+ list_del_rcu(&table->list);
+ nf_tables_table_notify(&ctx, trans->msg_type);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
- nft_chain_commit_update(trans);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_update(nft_trans_container_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
+ &nft_trans_chain_hooks(trans));
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
/* trans destroyed after rcu grace period */
} else {
- nft_clear(net, trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
+ nft_clear(net, nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
nft_trans_destroy(trans);
}
break;
case NFT_MSG_DELCHAIN:
- nft_chain_del(trans->ctx.chain);
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ &nft_trans_chain_hooks(trans));
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
+ } else {
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
+ NULL);
+ nf_tables_unregister_hook(ctx.net, ctx.table,
+ nft_trans_chain(trans));
+ }
break;
case NFT_MSG_NEWRULE:
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
+ nft_clear(net, nft_trans_rule(trans));
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
NFT_MSG_NEWRULE);
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_DELRULE:
+ case NFT_MSG_DESTROYRULE:
list_del_rcu(&nft_trans_rule(trans)->list);
- nf_tables_rule_notify(&trans->ctx,
- nft_trans_rule(trans),
- NFT_MSG_DELRULE);
+ nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
+ trans->msg_type);
+ nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans),
+ NFT_TRANS_COMMIT);
+
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_clear(net, nft_trans_set(trans));
- /* This avoids hitting -EBUSY when deleting the table
- * from the transaction.
- */
- if (nft_set_is_anonymous(nft_trans_set(trans)) &&
- !list_empty(&nft_trans_set(trans)->bindings))
- trans->ctx.table->use--;
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
+ if (nft_trans_set_update(trans)) {
+ struct nft_set *set = nft_trans_set(trans);
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
+ WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
+ WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));
+
+ if (nft_trans_set_size(trans))
+ WRITE_ONCE(set->size, nft_trans_set_size(trans));
+ } else {
+ nft_clear(net, nft_trans_set(trans));
+ /* This avoids hitting -EBUSY when deleting the table
+ * from the transaction.
+ */
+ if (nft_set_is_anonymous(nft_trans_set(trans)) &&
+ !list_empty(&nft_trans_set(trans)->bindings))
+ nft_use_dec(&table->use);
+ }
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
NFT_MSG_NEWSET, GFP_KERNEL);
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSET:
+ case NFT_MSG_DESTROYSET:
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
- nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
- NFT_MSG_DELSET, GFP_KERNEL);
+ nf_tables_set_notify(&ctx, nft_trans_set(trans),
+ trans->msg_type, GFP_KERNEL);
break;
case NFT_MSG_NEWSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ te = nft_trans_container_elem(trans);
- te->set->ops->activate(net, te->set, &te->elem);
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_NEWSETELEM, 0);
+ nft_trans_elems_add(&ctx, te);
+
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
- nf_tables_setelem_notify(&trans->ctx, te->set,
- &te->elem,
- NFT_MSG_DELSETELEM, 0);
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
- te->set->ndeact--;
+ nft_trans_elems_remove(&ctx, te);
+
+ if (te->set->ops->commit &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
break;
case NFT_MSG_NEWOBJ:
- nft_clear(net, nft_trans_obj(trans));
- nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
- NFT_MSG_NEWOBJ);
- nft_trans_destroy(trans);
+ if (nft_trans_obj_update(trans)) {
+ nft_obj_commit_update(&ctx, trans);
+ nf_tables_obj_notify(&ctx,
+ nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ } else {
+ nft_clear(net, nft_trans_obj(trans));
+ nf_tables_obj_notify(&ctx,
+ nft_trans_obj(trans),
+ NFT_MSG_NEWOBJ);
+ nft_trans_destroy(trans);
+ }
break;
case NFT_MSG_DELOBJ:
- list_del_rcu(&nft_trans_obj(trans)->list);
- nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
- NFT_MSG_DELOBJ);
+ case NFT_MSG_DESTROYOBJ:
+ nft_obj_del(nft_trans_obj(trans));
+ nf_tables_obj_notify(&ctx, nft_trans_obj(trans),
+ trans->msg_type);
break;
case NFT_MSG_NEWFLOWTABLE:
- nft_clear(net, nft_trans_flowtable(trans));
- nf_tables_flowtable_notify(&trans->ctx,
- nft_trans_flowtable(trans),
- NFT_MSG_NEWFLOWTABLE);
+ if (nft_trans_flowtable_update(trans)) {
+ nft_trans_flowtable(trans)->data.flags =
+ nft_trans_flowtable_flags(trans);
+ nf_tables_flowtable_notify(&ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans),
+ NFT_MSG_NEWFLOWTABLE);
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ } else {
+ nft_clear(net, nft_trans_flowtable(trans));
+ nf_tables_flowtable_notify(&ctx,
+ nft_trans_flowtable(trans),
+ NULL,
+ NFT_MSG_NEWFLOWTABLE);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_DELFLOWTABLE:
- list_del_rcu(&nft_trans_flowtable(trans)->list);
- nf_tables_flowtable_notify(&trans->ctx,
- nft_trans_flowtable(trans),
- NFT_MSG_DELFLOWTABLE);
- nft_unregister_flowtable_net_hooks(net,
- nft_trans_flowtable(trans));
+ case NFT_MSG_DESTROYFLOWTABLE:
+ if (nft_trans_flowtable_update(trans)) {
+ nf_tables_flowtable_notify(&ctx,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans),
+ trans->msg_type);
+ nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans));
+ } else {
+ list_del_rcu(&nft_trans_flowtable(trans)->list);
+ nf_tables_flowtable_notify(&ctx,
+ nft_trans_flowtable(trans),
+ NULL,
+ trans->msg_type);
+ nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ }
break;
}
}
+ nft_set_commit_update(&set_update_list);
+
+ nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+ nf_tables_commit_audit_log(&adl, nft_base_seq(net));
+
+ nft_gc_seq_end(nft_net, gc_seq);
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
nf_tables_commit_release(net);
return 0;
}
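
Read top to bottom, nf_tables_commit() is a phase machine: everything that can fail (validation, offload, audit and blob allocation) happens before publication, and everything after the blob swap and base_seq bump must be infallible. The sketch below paraphrases the numbered comments in the diff; the phase names are editorial, not kernel symbols:

/* Sketch: commit phases in order. Each phase must either fail
 * before publication or be infallible after it.
 */
#include <stdio.h>

typedef int (*phase_fn)(void);

static int validate(void) { puts("0: validate ruleset"); return 0; }
static int prepare(void)  { puts("1: allocate next-gen rule blobs"); return 0; }
static int publish(void)  { puts("2: swap blob pointers for packet path"); return 0; }
static int newgen(void)   { puts("3: bump base_seq, flip gencursor"); return 0; }
static int apply(void)    { puts("4: apply transactions, send notifications"); return 0; }
static int release(void)  { puts("5: defer frees to the destroy worker"); return 0; }

int main(void)
{
	phase_fn phase[] = { validate, prepare, publish, newgen, apply, release };

	for (unsigned int i = 0; i < 6; i++)
		if (phase[i]())
			return 1;	/* only phases 0-1 may fail */
	return 0;
}
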
+static void nf_tables_module_autoload(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_module_request *req, *next;
+ LIST_HEAD(module_list);
+
+ list_splice_init(&nft_net->module_list, &module_list);
+ mutex_unlock(&nft_net->commit_mutex);
+ list_for_each_entry_safe(req, next, &module_list, list) {
+ request_module("%s", req->module);
+ req->done = true;
+ }
+ mutex_lock(&nft_net->commit_mutex);
+ list_splice(&module_list, &nft_net->module_list);
+}
+
static void nf_tables_abort_release(struct nft_trans *trans)
{
+ struct nft_ctx ctx = { };
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
- nf_tables_table_destroy(&trans->ctx);
+ nf_tables_table_destroy(trans->table);
break;
case NFT_MSG_NEWCHAIN:
- nf_tables_chain_destroy(&trans->ctx);
+ if (nft_trans_chain_update(trans))
+ nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+ else
+ nf_tables_chain_destroy(nft_trans_chain(trans));
break;
case NFT_MSG_NEWRULE:
- nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
+ nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
- nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv, true);
+ nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans));
break;
case NFT_MSG_NEWOBJ:
- nft_obj_destroy(&trans->ctx, nft_trans_obj(trans));
+ nft_obj_destroy(&ctx, nft_trans_obj(trans));
break;
case NFT_MSG_NEWFLOWTABLE:
- nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans))
+ nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
+ else
+ nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
break;
}
kfree(trans);
}
-static int __nf_tables_abort(struct net *net)
+static void nft_set_abort_update(struct list_head *set_update_list)
{
+ struct nft_set *set, *next;
+
+ list_for_each_entry_safe(set, next, set_update_list, pending_update) {
+ list_del_init(&set->pending_update);
+
+ if (!set->ops->abort)
+ continue;
+
+ set->ops->abort(set);
+ }
+}
+
+static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
+ LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
+ struct nft_ctx ctx = {
+ .net = net,
+ };
+ int err = 0;
+
+ if (action == NFNL_ABORT_VALIDATE &&
+ nf_tables_validate(net) < 0)
+ err = -EAGAIN;
- list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
list) {
+ struct nft_table *table = trans->table;
+
+ nft_ctx_update(&ctx, trans);
+
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
+ if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
+ nft_trans_destroy(trans);
+ break;
}
+ if (table->flags & __NFT_TABLE_F_WAS_DORMANT) {
+ nf_tables_table_disable(net, table);
+ table->flags |= NFT_TABLE_F_DORMANT;
+ } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
+ table->flags &= ~NFT_TABLE_F_DORMANT;
+ }
+ if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ table->nlpid = 0;
+ }
+ table->flags &= ~__NFT_TABLE_F_UPDATE;
nft_trans_destroy(trans);
} else {
- list_del_rcu(&trans->ctx.table->list);
+ list_del_rcu(&table->list);
}
break;
case NFT_MSG_DELTABLE:
- nft_clear(trans->ctx.net, trans->ctx.table);
+ case NFT_MSG_DESTROYTABLE:
+ nft_clear(trans->net, table);
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
+ if (!(table->flags & NFT_TABLE_F_DORMANT)) {
+ nft_netdev_unregister_hooks(net,
+ &nft_trans_chain_hooks(trans),
+ true);
+ }
free_percpu(nft_trans_chain_stats(trans));
kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
} else {
- trans->ctx.table->use--;
- nft_chain_del(trans->ctx.chain);
- nf_tables_unregister_hook(trans->ctx.net,
- trans->ctx.table,
- trans->ctx.chain);
+ if (nft_trans_chain_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&table->use);
+ nft_chain_del(nft_trans_chain(trans));
+ nf_tables_unregister_hook(trans->net, table,
+ nft_trans_chain(trans));
}
break;
case NFT_MSG_DELCHAIN:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, trans->ctx.chain);
+ case NFT_MSG_DESTROYCHAIN:
+ if (nft_trans_chain_update(trans)) {
+ list_splice(&nft_trans_chain_hooks(trans),
+ &nft_trans_basechain(trans)->hook_list);
+ } else {
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_chain(trans));
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWRULE:
- trans->ctx.chain->use--;
+ if (nft_trans_rule_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&nft_trans_rule_chain(trans)->use);
list_del_rcu(&nft_trans_rule(trans)->list);
- nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans));
+ nft_rule_expr_deactivate(&ctx,
+ nft_trans_rule(trans),
+ NFT_TRANS_ABORT);
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
break;
case NFT_MSG_DELRULE:
- trans->ctx.chain->use++;
- nft_clear(trans->ctx.net, nft_trans_rule(trans));
- nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans));
+ case NFT_MSG_DESTROYRULE:
+ nft_use_inc_restore(&nft_trans_rule_chain(trans)->use);
+ nft_clear(trans->net, nft_trans_rule(trans));
+ nft_rule_expr_activate(&ctx, nft_trans_rule(trans));
+ if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
+ nft_flow_rule_destroy(nft_trans_flow_rule(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSET:
- trans->ctx.table->use--;
+ list_del(&nft_trans_container_set(trans)->list_trans_newset);
+ if (nft_trans_set_update(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_use_dec_restore(&table->use);
+ if (nft_trans_set_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
break;
case NFT_MSG_DELSET:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_set(trans));
+ case NFT_MSG_DESTROYSET:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_set(trans));
+ if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_activate(&ctx, nft_trans_set(trans));
+
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ if (nft_trans_elem_set_bound(trans)) {
+ nft_trans_destroy(trans);
+ break;
+ }
+ te = nft_trans_container_elem(trans);
+ if (!nft_trans_elems_new_abort(&ctx, te)) {
+ nft_trans_destroy(trans);
+ break;
+ }
- te->set->ops->remove(net, te->set, &te->elem);
- atomic_dec(&te->set->nelems);
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
break;
case NFT_MSG_DELSETELEM:
- te = (struct nft_trans_elem *)trans->data;
+ case NFT_MSG_DESTROYSETELEM:
+ te = nft_trans_container_elem(trans);
- nft_set_elem_activate(net, te->set, &te->elem);
- te->set->ops->activate(net, te->set, &te->elem);
- te->set->ndeact--;
+ nft_trans_elems_destroy_abort(&ctx, te);
+ if (te->set->ops->abort &&
+ list_empty(&te->set->pending_update)) {
+ list_add_tail(&te->set->pending_update,
+ &set_update_list);
+ }
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWOBJ:
- trans->ctx.table->use--;
- list_del_rcu(&nft_trans_obj(trans)->list);
+ if (nft_trans_obj_update(trans)) {
+ nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans));
+ nft_trans_destroy(trans);
+ } else {
+ nft_use_dec_restore(&table->use);
+ nft_obj_del(nft_trans_obj(trans));
+ }
break;
case NFT_MSG_DELOBJ:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_obj(trans));
+ case NFT_MSG_DESTROYOBJ:
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_obj(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWFLOWTABLE:
- trans->ctx.table->use--;
- list_del_rcu(&nft_trans_flowtable(trans)->list);
- nft_unregister_flowtable_net_hooks(net,
- nft_trans_flowtable(trans));
+ if (nft_trans_flowtable_update(trans)) {
+ nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable_hooks(trans));
+ } else {
+ nft_use_dec_restore(&table->use);
+ list_del_rcu(&nft_trans_flowtable(trans)->list);
+ nft_unregister_flowtable_net_hooks(net,
+ nft_trans_flowtable(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ }
break;
case NFT_MSG_DELFLOWTABLE:
- trans->ctx.table->use++;
- nft_clear(trans->ctx.net, nft_trans_flowtable(trans));
+ case NFT_MSG_DESTROYFLOWTABLE:
+ if (nft_trans_flowtable_update(trans)) {
+ list_splice(&nft_trans_flowtable_hooks(trans),
+ &nft_trans_flowtable(trans)->hook_list);
+ } else {
+ nft_use_inc_restore(&table->use);
+ nft_clear(trans->net, nft_trans_flowtable(trans));
+ }
nft_trans_destroy(trans);
break;
}
}
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ nft_set_abort_update(&set_update_list);
+
synchronize_rcu();
list_for_each_entry_safe_reverse(trans, next,
- &net->nft.commit_list, list) {
- list_del(&trans->list);
+ &nft_net->commit_list, list) {
+ nft_trans_list_del(trans);
nf_tables_abort_release(trans);
}
- return 0;
+ return err;
}
-static void nf_tables_cleanup(struct net *net)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb,
+ enum nfnl_abort_action action)
{
- nft_validate_state_update(net, NFT_VALIDATE_SKIP);
-}
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ unsigned int gc_seq;
+ int ret;
-static int nf_tables_abort(struct net *net, struct sk_buff *skb)
-{
- int ret = __nf_tables_abort(net);
+ gc_seq = nft_gc_seq_begin(nft_net);
+ ret = __nf_tables_abort(net, action);
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+
+ /* module autoload needs to happen after GC sequence update because it
+ * temporarily releases and grabs mutex again.
+ */
+ if (action == NFNL_ABORT_AUTOLOAD)
+ nf_tables_module_autoload(net);
+ else
+ nf_tables_module_autoload_cleanup(net);
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
return ret;
}
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
+ struct nftables_pernet *nft_net = nft_pernet(net);
bool genid_ok;
- mutex_lock(&net->nft.commit_mutex);
+ mutex_lock(&nft_net->commit_mutex);
+ nft_net->tstamp = get_jiffies_64();
- genid_ok = genid == 0 || net->nft.base_seq == genid;
+ genid_ok = genid == 0 || nft_base_seq(net) == genid;
if (!genid_ok)
- mutex_unlock(&net->nft.commit_mutex);
+ mutex_unlock(&nft_net->commit_mutex);
/* else, commit mutex has to be released by commit or abort function */
return genid_ok;
@@ -6783,7 +11512,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
- .cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
.owner = THIS_MODULE,
};
@@ -6820,106 +11548,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain,
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);
-/*
- * Loop detection - walk through the ruleset beginning at the destination chain
- * of a new jump until either the source chain is reached (loop) or all
- * reachable chains have been traversed.
- *
- * The loop check is performed whenever a new jump verdict is added to an
- * expression or verdict map or a verdict map is bound to a new chain.
- */
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain);
-
-static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- const struct nft_data *data;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- return nf_tables_check_loops(ctx, data->verdict.chain);
- default:
- return 0;
- }
-}
-
-static int nf_tables_check_loops(const struct nft_ctx *ctx,
- const struct nft_chain *chain)
-{
- const struct nft_rule *rule;
- const struct nft_expr *expr, *last;
- struct nft_set *set;
- struct nft_set_binding *binding;
- struct nft_set_iter iter;
-
- if (ctx->chain == chain)
- return -ELOOP;
-
- list_for_each_entry(rule, &chain->rules, list) {
- nft_rule_for_each_expr(expr, last, rule) {
- struct nft_immediate_expr *priv;
- const struct nft_data *data;
- int err;
-
- if (strcmp(expr->ops->type->name, "immediate"))
- continue;
-
- priv = nft_expr_priv(expr);
- if (priv->dreg != NFT_REG_VERDICT)
- continue;
-
- data = &priv->data;
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- err = nf_tables_check_loops(ctx,
- data->verdict.chain);
- if (err < 0)
- return err;
- default:
- break;
- }
- }
- }
-
- list_for_each_entry(set, &ctx->table->sets, list) {
- if (!nft_is_active_next(ctx->net, set))
- continue;
- if (!(set->flags & NFT_SET_MAP) ||
- set->dtype != NFT_DATA_VERDICT)
- continue;
-
- list_for_each_entry(binding, &set->bindings, list) {
- if (!(binding->flags & NFT_SET_MAP) ||
- binding->chain != chain)
- continue;
-
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nf_tables_loop_check_setelem;
-
- set->ops->walk(ctx, set, &iter);
- if (iter.err < 0)
- return iter.err;
- }
- }
-
- return 0;
-}
-
/**
* nft_parse_u32_check - fetch u32 attribute and check for maximum value
*
@@ -6945,28 +11573,24 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);
-/**
- * nft_parse_register - parse a register value from a netlink attribute
- *
- * @attr: netlink attribute
- *
- * Parse and translate a register value from a netlink attribute.
- * Registers used to be 128 bit wide, these register numbers will be
- * mapped to the corresponding 32 bit register numbers.
- */
-unsigned int nft_parse_register(const struct nlattr *attr)
+static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
unsigned int reg;
reg = ntohl(nla_get_be32(attr));
switch (reg) {
case NFT_REG_VERDICT...NFT_REG_4:
- return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ *preg = reg * NFT_REG_SIZE / NFT_REG32_SIZE;
+ break;
+ case NFT_REG32_00...NFT_REG32_15:
+ *preg = reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ break;
default:
- return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
+ return -ERANGE;
}
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_parse_register);
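
The mapping arithmetic converts legacy 128-bit register numbers (NFT_REG_VERDICT..NFT_REG_4) into 32-bit register indices, each legacy register spanning NFT_REG_SIZE/NFT_REG32_SIZE = 4 slots, while NFT_REG32_00..15 land immediately after the verdict area. A worked example with constants mirroring the nf_tables UAPI header (treat the values as assumptions of this sketch):

/* Sketch: legacy 128-bit register numbers vs. 32-bit indices. */
#include <stdio.h>

#define NFT_REG_SIZE	16
#define NFT_REG32_SIZE	4
#define NFT_REG_VERDICT	0
#define NFT_REG_4	4
#define NFT_REG32_00	8

int main(void)
{
	/* legacy reg 0..4 -> 32-bit index 0, 4, 8, 12, 16 */
	for (unsigned int reg = NFT_REG_VERDICT; reg <= NFT_REG_4; reg++)
		printf("legacy reg %u -> index %u\n", reg,
		       reg * NFT_REG_SIZE / NFT_REG32_SIZE);

	/* NFT_REG32_00 -> index 4, right after the verdict slots */
	printf("NFT_REG32_00 -> index %u\n",
	       NFT_REG32_00 + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00);
	return 0;
}
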
/**
* nft_dump_register - dump a register value to a netlink attribute
@@ -6990,46 +11614,67 @@ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
}
EXPORT_SYMBOL_GPL(nft_dump_register);
-/**
- * nft_validate_register_load - validate a load from a register
- *
- * @reg: the register number
- * @len: the length of the data
- *
- * Validate that the input register is one of the general purpose
- * registers and that the length of the load is within the bounds.
- */
-int nft_validate_register_load(enum nft_registers reg, unsigned int len)
+static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
{
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
if (len == 0)
return -EINVAL;
- if (reg * NFT_REG32_SIZE + len > FIELD_SIZEOF(struct nft_regs, data))
+ if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data))
return -ERANGE;
return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_register_load);
-/**
- * nft_validate_register_store - validate an expressions' register store
- *
- * @ctx: context of the expression performing the load
- * @reg: the destination register number
- * @data: the data to load
- * @type: the data type
- * @len: the length of the data
- *
- * Validate that a data load uses the appropriate data type for
- * the destination register and the length is within the bounds.
- * A value of NULL for the data means that its runtime gathered
- * data.
- */
-int nft_validate_register_store(const struct nft_ctx *ctx,
- enum nft_registers reg,
- const struct nft_data *data,
- enum nft_data_types type, unsigned int len)
+int nft_parse_register_load(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *sreg, u32 len)
+{
+ int err, invalid_reg;
+ u32 reg, next_register;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_load(reg, len);
+ if (err < 0)
+ return err;
+
+ next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
+
+ /* Can't happen: nft_validate_register_load() should have failed */
+ if (WARN_ON_ONCE(next_register > NFT_REG32_NUM))
+ return -EINVAL;
+
+ /* find first register that did not see an earlier store. */
+ invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg);
+
+ /* invalid register within the range that we're loading from? */
+ if (invalid_reg < next_register)
+ return -ENODATA;
+
+ *sreg = reg;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nft_parse_register_load);
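
The reworked load path refuses to read registers that no earlier expression initialized: nft_saw_register_store() marks stored ranges in ctx->reg_inited, and the find_next_zero_bit() probe above fails the parse with -ENODATA if any slot in the loaded range was never written. A userspace sketch with a hand-rolled bitmap (the kernel uses bitmap_set() and find_next_zero_bit()):

/* Sketch: refuse loads from registers that never saw a store. */
#include <stdbool.h>
#include <stdio.h>

#define NREGS 20

static bool inited[NREGS];

static void saw_store(unsigned int reg, unsigned int n)
{
	while (n--)
		inited[reg++] = true;
}

static bool load_ok(unsigned int reg, unsigned int n)
{
	while (n--)
		if (!inited[reg++])
			return false;	/* -ENODATA in the kernel */
	return true;
}

int main(void)
{
	saw_store(4, 4);	/* e.g. a payload store into regs 4..7 */
	printf("load 4..7: %d\n", load_ok(4, 4));	/* 1 */
	printf("load 4..8: %d\n", load_ok(4, 5));	/* 0 */
	return 0;
}
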
+
+static void nft_saw_register_store(const struct nft_ctx *__ctx,
+ int reg, unsigned int len)
+{
+ unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE);
+ struct nft_ctx *ctx = (struct nft_ctx *)__ctx;
+
+ if (WARN_ON_ONCE(len == 0 || reg < 0))
+ return;
+
+ bitmap_set(ctx->reg_inited, reg, registers);
+}
+
+static int nft_validate_register_store(const struct nft_ctx *ctx,
+ enum nft_registers reg,
+ const struct nft_data *data,
+ enum nft_data_types type,
+ unsigned int len)
{
int err;
@@ -7041,32 +11686,57 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
if (data != NULL &&
(data->verdict.code == NFT_GOTO ||
data->verdict.code == NFT_JUMP)) {
- err = nf_tables_check_loops(ctx, data->verdict.chain);
+ err = nft_chain_validate(ctx, data->verdict.chain);
if (err < 0)
return err;
}
- return 0;
+ break;
default:
+ if (type != NFT_DATA_VALUE)
+ return -EINVAL;
+
if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
return -EINVAL;
if (len == 0)
return -EINVAL;
if (reg * NFT_REG32_SIZE + len >
- FIELD_SIZEOF(struct nft_regs, data))
+ sizeof_field(struct nft_regs, data))
return -ERANGE;
- if (data != NULL && type != NFT_DATA_VALUE)
- return -EINVAL;
- return 0;
+ break;
}
+
+ nft_saw_register_store(ctx, reg, len);
+ return 0;
+}
+
+int nft_parse_register_store(const struct nft_ctx *ctx,
+ const struct nlattr *attr, u8 *dreg,
+ const struct nft_data *data,
+ enum nft_data_types type, unsigned int len)
+{
+ int err;
+ u32 reg;
+
+ err = nft_parse_register(attr, &reg);
+ if (err < 0)
+ return err;
+
+ err = nft_validate_register_store(ctx, reg, data, type, len);
+ if (err < 0)
+ return err;
+
+ *dreg = reg;
+ return 0;
}
-EXPORT_SYMBOL_GPL(nft_validate_register_store);
+EXPORT_SYMBOL_GPL(nft_parse_register_store);
static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
[NFTA_VERDICT_CODE] = { .type = NLA_U32 },
[NFTA_VERDICT_CHAIN] = { .type = NLA_STRING,
.len = NFT_CHAIN_MAXNAMELEN - 1 },
+ [NFTA_VERDICT_CHAIN_ID] = { .type = NLA_U32 },
};
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
@@ -7077,57 +11747,75 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_chain *chain;
int err;
- err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_VERDICT_MAX, nla,
+ nft_verdict_policy, NULL);
if (err < 0)
return err;
if (!tb[NFTA_VERDICT_CODE])
return -EINVAL;
+
+ /* zero padding hole for memcmp */
+ memset(data, 0, sizeof(*data));
data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));
switch (data->verdict.code) {
- default:
- switch (data->verdict.code & NF_VERDICT_MASK) {
- case NF_ACCEPT:
- case NF_DROP:
- case NF_QUEUE:
- break;
- default:
- return -EINVAL;
- }
- /* fall through */
+ case NF_ACCEPT:
+ case NF_DROP:
+ case NF_QUEUE:
+ break;
case NFT_CONTINUE:
case NFT_BREAK:
case NFT_RETURN:
break;
case NFT_JUMP:
case NFT_GOTO:
- if (!tb[NFTA_VERDICT_CHAIN])
+ if (tb[NFTA_VERDICT_CHAIN]) {
+ chain = nft_chain_lookup(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN],
+ genmask);
+ } else if (tb[NFTA_VERDICT_CHAIN_ID]) {
+ chain = nft_chain_lookup_byid(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN_ID],
+ genmask);
+ if (IS_ERR(chain))
+ return PTR_ERR(chain);
+ } else {
return -EINVAL;
- chain = nft_chain_lookup(ctx->net, ctx->table,
- tb[NFTA_VERDICT_CHAIN], genmask);
+ }
+
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
return -EOPNOTSUPP;
+ if (nft_chain_is_bound(chain))
+ return -EINVAL;
+ if (desc->flags & NFT_DATA_DESC_SETELEM &&
+ chain->flags & NFT_CHAIN_BINDING)
+ return -EINVAL;
+ if (!nft_use_inc(&chain->use))
+ return -EMFILE;
- chain->use++;
data->verdict.chain = chain;
break;
+ default:
+ return -EINVAL;
}
desc->len = sizeof(data->verdict);
- desc->type = NFT_DATA_VERDICT;
+
return 0;
}
static void nft_verdict_uninit(const struct nft_data *data)
{
+ struct nft_chain *chain;
+
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- data->verdict.chain->use--;
+ chain = data->verdict.chain;
+ nft_use_dec(&chain->use);
break;
}
}
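
Jump targets are now pinned with a bounded counter: nft_verdict_init() takes a reference through nft_use_inc() and reports -EMFILE when the counter would saturate, and nft_verdict_uninit() releases it. A toy version of such a saturating use counter (the cap of 3 exists only so the demo overflows quickly; the kernel limit is much larger):

/* Sketch: bounded use counter that refuses to overflow. */
#include <stdbool.h>
#include <stdio.h>

#define USE_MAX 3

static bool use_inc(unsigned int *use)
{
	if (*use == USE_MAX)
		return false;	/* caller translates to -EMFILE */
	(*use)++;
	return true;
}

static void use_dec(unsigned int *use)
{
	(*use)--;	/* the kernel variant WARNs on underflow */
}

int main(void)
{
	unsigned int chain_use = 0;

	for (int i = 0; i < 5; i++)
		printf("ref %d: %s\n", i,
		       use_inc(&chain_use) ? "ok" : "EMFILE");
	use_dec(&chain_use);	/* the nft_verdict_uninit() side */
	return 0;
}
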
@@ -7136,7 +11824,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, type);
+ nest = nla_nest_start_noflag(skb, type);
if (!nest)
goto nla_put_failure;
@@ -7158,20 +11846,25 @@ nla_put_failure:
}
static int nft_value_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
- struct nft_data_desc *desc, const struct nlattr *nla)
+ struct nft_data *data, struct nft_data_desc *desc,
+ const struct nlattr *nla)
{
unsigned int len;
len = nla_len(nla);
if (len == 0)
return -EINVAL;
- if (len > size)
+ if (len > desc->size)
return -EOVERFLOW;
+ if (desc->len) {
+ if (len != desc->len)
+ return -EINVAL;
+ } else {
+ desc->len = len;
+ }
nla_memcpy(data->data, nla, len);
- desc->type = NFT_DATA_VALUE;
- desc->len = len;
+
return 0;
}
@@ -7191,7 +11884,6 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
*
* @ctx: context of the expression using the data
* @data: destination struct nft_data
- * @size: maximum data length
* @desc: data description
* @nla: netlink attribute containing data
*
@@ -7201,23 +11893,35 @@ static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
* The caller can indicate that it only wants to accept data of type
* NFT_DATA_VALUE by passing NULL for the ctx argument.
*/
-int nft_data_init(const struct nft_ctx *ctx,
- struct nft_data *data, unsigned int size,
+int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nft_data_desc *desc, const struct nlattr *nla)
{
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL);
+ if (WARN_ON_ONCE(!desc->size))
+ return -EINVAL;
+
+ err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
+ nft_data_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_DATA_VALUE])
- return nft_value_init(ctx, data, size, desc,
- tb[NFTA_DATA_VALUE]);
- if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
- return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
- return -EINVAL;
+ if (tb[NFTA_DATA_VALUE]) {
+ if (desc->type != NFT_DATA_VALUE)
+ return -EINVAL;
+
+ err = nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
+ } else if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
+ if (desc->type != NFT_DATA_VERDICT)
+ return -EINVAL;
+
+ err = nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
+ } else {
+ err = -EINVAL;
+ }
+
+ return err;
}
EXPORT_SYMBOL_GPL(nft_data_init);
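
With the reworked API the caller pre-declares what it will accept: desc->size bounds the value, desc->type selects value versus verdict, and a nonzero desc->len demands an exact length. A small sketch of the value-side rules as they appear in nft_value_init() above:

/* Sketch: descriptor-driven validation of a value attribute. */
#include <stdio.h>

struct desc { unsigned int size, len; };

static int value_init(struct desc *d, unsigned int attr_len)
{
	if (attr_len == 0)
		return -1;		/* -EINVAL */
	if (attr_len > d->size)
		return -2;		/* -EOVERFLOW */
	if (d->len && attr_len != d->len)
		return -1;		/* exact length demanded */
	d->len = attr_len;		/* otherwise record actual length */
	return 0;
}

int main(void)
{
	struct desc exact = { .size = 16, .len = 4 };
	struct desc any   = { .size = 16, .len = 0 };

	printf("%d %d %d\n",
	       value_init(&exact, 4),	/*  0 */
	       value_init(&any,   6),	/*  0, any.len becomes 6 */
	       value_init(&exact, 8));	/* -1, wanted exactly 4 */
	return 0;
}
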
@@ -7249,7 +11953,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
struct nlattr *nest;
int err;
- nest = nla_nest_start(skb, attr);
+ nest = nla_nest_start_noflag(skb, attr);
if (nest == NULL)
return -1;
@@ -7270,31 +11974,35 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-int __nft_release_basechain(struct nft_ctx *ctx)
+static void __nft_release_hook(struct net *net, struct nft_table *table)
{
- struct nft_rule *rule, *nr;
+ struct nft_flowtable *flowtable;
+ struct nft_chain *chain;
- if (WARN_ON(!nft_is_base_chain(ctx->chain)))
- return 0;
+ list_for_each_entry(chain, &table->chains, list)
+ __nf_tables_unregister_hook(net, table, chain, true);
+ list_for_each_entry(flowtable, &table->flowtables, list)
+ __nft_unregister_flowtable_net_hooks(net, flowtable,
+ &flowtable->hook_list,
+ true);
+}
- nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
- list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
- list_del(&rule->list);
- ctx->chain->use--;
- nf_tables_rule_release(ctx, rule);
- }
- nft_chain_del(ctx->chain);
- ctx->table->use--;
- nf_tables_chain_destroy(ctx);
+static void __nft_release_hooks(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table;
- return 0;
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
+ __nft_release_hook(net, table);
+ }
}
-EXPORT_SYMBOL_GPL(__nft_release_basechain);
-static void __nft_release_tables(struct net *net)
+static void __nft_release_table(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable, *nf;
- struct nft_table *table, *nt;
struct nft_chain *chain, *nc;
struct nft_object *obj, *ne;
struct nft_rule *rule, *nr;
@@ -7304,107 +12012,249 @@ static void __nft_release_tables(struct net *net)
.family = NFPROTO_NETDEV,
};
- list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
- ctx.family = table->family;
+ ctx.family = table->family;
+ ctx.table = table;
+ list_for_each_entry(chain, &table->chains, list) {
+ if (nft_chain_binding(chain))
+ continue;
- list_for_each_entry(chain, &table->chains, list)
- nf_tables_unregister_hook(net, table, chain);
- /* No packets are walking on these chains anymore. */
- ctx.table = table;
- list_for_each_entry(chain, &table->chains, list) {
- ctx.chain = chain;
- list_for_each_entry_safe(rule, nr, &chain->rules, list) {
- list_del(&rule->list);
- chain->use--;
- nf_tables_rule_release(&ctx, rule);
- }
- }
- list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
- list_del(&flowtable->list);
- table->use--;
- nf_tables_flowtable_destroy(flowtable);
- }
- list_for_each_entry_safe(set, ns, &table->sets, list) {
- list_del(&set->list);
- table->use--;
- nft_set_destroy(set);
- }
- list_for_each_entry_safe(obj, ne, &table->objects, list) {
- list_del(&obj->list);
- table->use--;
- nft_obj_destroy(&ctx, obj);
- }
- list_for_each_entry_safe(chain, nc, &table->chains, list) {
- ctx.chain = chain;
- nft_chain_del(chain);
- table->use--;
- nf_tables_chain_destroy(&ctx);
+ ctx.chain = chain;
+ list_for_each_entry_safe(rule, nr, &chain->rules, list) {
+ list_del(&rule->list);
+ nft_use_dec(&chain->use);
+ nf_tables_rule_release(&ctx, rule);
}
+ }
+ list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
+ list_del(&flowtable->list);
+ nft_use_dec(&table->use);
+ nf_tables_flowtable_destroy(flowtable);
+ }
+ list_for_each_entry_safe(set, ns, &table->sets, list) {
+ list_del(&set->list);
+ nft_use_dec(&table->use);
+ if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
+ nft_map_deactivate(&ctx, set);
+
+ nft_set_destroy(&ctx, set);
+ }
+ list_for_each_entry_safe(obj, ne, &table->objects, list) {
+ nft_obj_del(obj);
+ nft_use_dec(&table->use);
+ nft_obj_destroy(&ctx, obj);
+ }
+ list_for_each_entry_safe(chain, nc, &table->chains, list) {
+ nft_chain_del(chain);
+ nft_use_dec(&table->use);
+ nf_tables_chain_destroy(chain);
+ }
+ nf_tables_table_destroy(table);
+}
+
+static void __nft_release_tables(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_table *table, *nt;
+
+ list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
+ if (nft_table_has_owner(table))
+ continue;
+
list_del(&table->list);
- nf_tables_table_destroy(&ctx);
+
+ __nft_release_table(net, table);
}
}
+static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct nft_table *table, *to_delete[8];
+ struct nftables_pernet *nft_net;
+ struct netlink_notify *n = ptr;
+ struct net *net = n->net;
+ unsigned int deleted;
+ bool restart = false;
+ unsigned int gc_seq;
+
+ if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(net);
+ deleted = 0;
+ mutex_lock(&nft_net->commit_mutex);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ nf_tables_trans_destroy_flush_work(net);
+again:
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (nft_table_has_owner(table) &&
+ n->portid == table->nlpid) {
+ if (table->flags & NFT_TABLE_F_PERSIST) {
+ table->flags &= ~NFT_TABLE_F_OWNER;
+ continue;
+ }
+ __nft_release_hook(net, table);
+ list_del_rcu(&table->list);
+ to_delete[deleted++] = table;
+ if (deleted >= ARRAY_SIZE(to_delete))
+ break;
+ }
+ }
+ if (deleted) {
+ restart = deleted >= ARRAY_SIZE(to_delete);
+ synchronize_rcu();
+ while (deleted)
+ __nft_release_table(net, to_delete[--deleted]);
+
+ if (restart)
+ goto again;
+ }
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nft_nl_notifier = {
+ .notifier_call = nft_rcv_nl_event,
+};
+
static int __net_init nf_tables_init_net(struct net *net)
{
- INIT_LIST_HEAD(&net->nft.tables);
- INIT_LIST_HEAD(&net->nft.commit_list);
- mutex_init(&net->nft.commit_mutex);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ INIT_LIST_HEAD(&nft_net->tables);
+ INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->destroy_list);
+ INIT_LIST_HEAD(&nft_net->commit_set_list);
+ INIT_LIST_HEAD(&nft_net->binding_list);
+ INIT_LIST_HEAD(&nft_net->module_list);
+ INIT_LIST_HEAD(&nft_net->notify_list);
+ mutex_init(&nft_net->commit_mutex);
net->nft.base_seq = 1;
- net->nft.validate_state = NFT_VALIDATE_SKIP;
+ nft_net->gc_seq = 0;
+ nft_net->validate_state = NFT_VALIDATE_SKIP;
+ INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
return 0;
}
+static void __net_exit nf_tables_pre_exit_net(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ mutex_lock(&nft_net->commit_mutex);
+ __nft_release_hooks(net);
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
static void __net_exit nf_tables_exit_net(struct net *net)
{
- mutex_lock(&net->nft.commit_mutex);
- if (!list_empty(&net->nft.commit_list))
- __nf_tables_abort(net);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ unsigned int gc_seq;
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ gc_seq = nft_gc_seq_begin(nft_net);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));
+
+ if (!list_empty(&nft_net->module_list))
+ nf_tables_module_autoload_cleanup(net);
+
+ cancel_work_sync(&nft_net->destroy_work);
__nft_release_tables(net);
- mutex_unlock(&net->nft.commit_mutex);
- WARN_ON_ONCE(!list_empty(&net->nft.tables));
+
+ nft_gc_seq_end(nft_net, gc_seq);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ WARN_ON_ONCE(!list_empty(&nft_net->tables));
+ WARN_ON_ONCE(!list_empty(&nft_net->module_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
+}
+
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+ flush_work(&trans_gc_work);
}
static struct pernet_operations nf_tables_net_ops = {
- .init = nf_tables_init_net,
- .exit = nf_tables_exit_net,
+ .init = nf_tables_init_net,
+ .pre_exit = nf_tables_pre_exit_net,
+ .exit = nf_tables_exit_net,
+ .exit_batch = nf_tables_exit_batch,
+ .id = &nf_tables_net_id,
+ .size = sizeof(struct nftables_pernet),
};
static int __init nf_tables_module_init(void)
{
int err;
- spin_lock_init(&nf_tables_destroy_list_lock);
+ BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0);
+ BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0);
+
err = register_pernet_subsys(&nf_tables_net_ops);
if (err < 0)
return err;
err = nft_chain_filter_init();
if (err < 0)
- goto err1;
+ goto err_chain_filter;
err = nf_tables_core_module_init();
if (err < 0)
- goto err2;
+ goto err_core_module;
err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
if (err < 0)
- goto err3;
+ goto err_netdev_notifier;
+
+ err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params);
+ if (err < 0)
+ goto err_rht_objname;
+
+ err = nft_offload_init();
+ if (err < 0)
+ goto err_offload;
+
+ err = netlink_register_notifier(&nft_nl_notifier);
+ if (err < 0)
+ goto err_netlink_notifier;
/* must be last */
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
- goto err4;
+ goto err_nfnl_subsys;
+
+ nft_chain_route_init();
return err;
-err4:
+
+err_nfnl_subsys:
+ netlink_unregister_notifier(&nft_nl_notifier);
+err_netlink_notifier:
+ nft_offload_exit();
+err_offload:
+ rhltable_destroy(&nft_objname_ht);
+err_rht_objname:
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
-err3:
+err_netdev_notifier:
nf_tables_core_module_exit();
-err2:
+err_core_module:
nft_chain_filter_fini();
-err1:
+err_chain_filter:
unregister_pernet_subsys(&nf_tables_net_ops);
return err;
}
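
The error labels were renamed from err1..err4 to descriptive names, preserving the invariant that each label unwinds, in reverse order, exactly the registrations that succeeded before the failing step. A minimal illustration of the same goto ladder:

/* Sketch: reverse-order unwind behind descriptive labels. */
#include <stdio.h>

static int reg_a(void)    { puts("register a"); return 0; }
static int reg_b(void)    { puts("register b: fails"); return -1; }
static void unreg_a(void) { puts("unregister a"); }

static int init_demo(void)
{
	int err = reg_a();

	if (err)
		goto err_a;
	err = reg_b();
	if (err)
		goto err_b;
	return 0;

err_b:		/* undo only what succeeded before reg_b() */
	unreg_a();
err_a:
	return err;
}

int main(void)
{
	printf("init: %d\n", init_demo());
	return 0;
}
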
@@ -7412,11 +12262,15 @@ err1:
static void __exit nf_tables_module_exit(void)
{
nfnetlink_subsys_unregister(&nf_tables_subsys);
+ netlink_unregister_notifier(&nft_nl_notifier);
+ nft_offload_exit();
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
+ nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
- cancel_work_sync(&trans_destroy_work);
+ cancel_work_sync(&trans_gc_work);
rcu_barrier();
+ rhltable_destroy(&nft_objname_ht);
nf_tables_core_module_exit();
}
@@ -7425,4 +12279,5 @@ module_exit(nf_tables_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Framework for packet filtering and classification");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index a50500232b0a..6557a4018c09 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -22,44 +19,128 @@
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nft_meta.h>
-static noinline void __nft_trace_packet(struct nft_traceinfo *info,
- const struct nft_chain *chain,
- enum nft_trace_types type)
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static struct static_key_false nf_tables_skip_direct_calls;
+
+static inline bool nf_skip_indirect_calls(void)
{
- const struct nft_pktinfo *pkt = info->pkt;
+ return static_branch_likely(&nf_tables_skip_direct_calls);
+}
- if (!info->trace || !pkt->skb->nf_trace)
+static inline void __init nf_skip_indirect_calls_enable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+ static_branch_enable(&nf_tables_skip_direct_calls);
+}
+#else
+static inline void nf_skip_indirect_calls_enable(void) { }
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+
+static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info,
+ enum nft_trace_types type)
+{
+ if (!info->trace || !info->nf_trace)
return;
- info->chain = chain;
info->type = type;
- nft_trace_notify(info);
+ nft_trace_notify(pkt, verdict, rule, info);
}
-static inline void nft_trace_packet(struct nft_traceinfo *info,
- const struct nft_chain *chain,
- const struct nft_rule *rule,
+static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+ struct nft_verdict *verdict,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
enum nft_trace_types type)
{
if (static_branch_unlikely(&nft_trace_enabled)) {
- info->rule = rule;
- __nft_trace_packet(info, chain, type);
+ info->nf_trace = pkt->skb->nf_trace;
+ __nft_trace_packet(pkt, verdict, rule, info, type);
}
}
+static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info)
+{
+ if (static_branch_unlikely(&nft_trace_enabled))
+ info->nf_trace = pkt->skb->nf_trace;
+}
+
+static void nft_bitwise_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ u32 *src = &regs->data[priv->sreg];
+ u32 *dst = &regs->data[priv->dreg];
+
+ *dst = (*src & priv->mask) ^ priv->xor;
+}
+
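
The fast bitwise path computes dst = (src & mask) ^ xor. A single (mask, xor) pair is enough to express any boolean operation against a constant c: AND is mask=c, xor=0; XOR is mask=~0, xor=c; OR is mask=~c, xor=c (clear the bits c will set, then set them). A small self-checking sketch of that encoding:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* dst = (src & mask) ^ xor, as in nft_bitwise_fast_eval */
    static uint32_t bw(uint32_t src, uint32_t mask, uint32_t xor)
    {
        return (src & mask) ^ xor;
    }

    int main(void)
    {
        uint32_t src = 0x12345678, c = 0x00ff00ff;

        assert(bw(src, c, 0)   == (src & c)); /* AND c: mask=c,  xor=0 */
        assert(bw(src, ~0u, c) == (src ^ c)); /* XOR c: mask=~0, xor=c */
        assert(bw(src, ~c, c)  == (src | c)); /* OR  c: mask=~c, xor=c */
        puts("one mask/xor pair covers AND, OR and XOR");
        return 0;
    }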
static void nft_cmp_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- u32 mask = nft_cmp_fast_mask(priv->len);
- if ((regs->data[priv->sreg] & mask) == priv->data)
+ if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv)
return;
regs->verdict.code = NFT_BREAK;
}
+static void nft_cmp16_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ const u64 *reg_data = (const u64 *)&regs->data[priv->sreg];
+ const u64 *mask = (const u64 *)&priv->mask;
+ const u64 *data = (const u64 *)&priv->data;
+
+ if (((reg_data[0] & mask[0]) == data[0] &&
+ ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv)
+ return;
+ regs->verdict.code = NFT_BREAK;
+}
+
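
The 16-byte comparator reads the register window and the match constants as two u64 words each, the common case being IPv6 addresses; the match data is assumed to be stored pre-masked. The same masked compare as a standalone userspace sketch (aligned buffers assumed):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static int masked_eq128(const void *reg, const void *mask, const void *data)
    {
        uint64_t r[2], m[2], d[2];

        memcpy(r, reg, 16);
        memcpy(m, mask, 16);
        memcpy(d, data, 16); /* d must already be ANDed with m */
        return (r[0] & m[0]) == d[0] && (r[1] & m[1]) == d[1];
    }

    int main(void)
    {
        /* match an IPv6 /64 prefix: only the first 8 bytes are compared */
        uint8_t addr[16] = { 0x20, 0x01, 0x0d, 0xb8, [8] = 0xaa };
        uint8_t mask[16] = { [0 ... 7] = 0xff };
        uint8_t want[16] = { 0x20, 0x01, 0x0d, 0xb8 };

        assert(masked_eq128(addr, mask, want));
        puts("prefix matched");
        return 0;
    }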
+static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ enum nft_trace_types type;
+
+ switch (regs->verdict.code & NF_VERDICT_MASK) {
+ case NFT_CONTINUE:
+ case NFT_RETURN:
+ type = NFT_TRACETYPE_RETURN;
+ break;
+ case NF_STOLEN:
+ type = NFT_TRACETYPE_RULE;
+ /* can't access skb->nf_trace; use copy */
+ break;
+ default:
+ type = NFT_TRACETYPE_RULE;
+
+ if (info->trace)
+ info->nf_trace = pkt->skb->nf_trace;
+ break;
+ }
+
+ __nft_trace_packet(pkt, &regs->verdict, rule, info, type);
+}
+
+static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
+ struct nft_traceinfo *info,
+ const struct nft_rule_dp *rule,
+ const struct nft_regs *regs)
+{
+ if (static_branch_unlikely(&nft_trace_enabled))
+ __nft_trace_verdict(pkt, info, rule, regs);
+}
+
static bool nft_payload_fast_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -72,9 +153,9 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
if (priv->base == NFT_PAYLOAD_NETWORK_HEADER)
ptr = skb_network_header(skb);
else {
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
return false;
- ptr = skb_network_header(skb) + pkt->xt.thoff;
+ ptr = skb->data + nft_thoff(pkt);
}
ptr += priv->offset;
@@ -98,73 +179,107 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
const struct nft_pktinfo *pkt)
{
struct nft_base_chain *base_chain;
+ struct nft_stats __percpu *pstats;
struct nft_stats *stats;
base_chain = nft_base_chain(chain);
- if (!rcu_access_pointer(base_chain->stats))
- return;
- local_bh_disable();
- stats = this_cpu_ptr(rcu_dereference(base_chain->stats));
- if (stats) {
+ pstats = READ_ONCE(base_chain->stats);
+ if (pstats) {
+ local_bh_disable();
+ stats = this_cpu_ptr(pstats);
u64_stats_update_begin(&stats->syncp);
stats->pkts++;
stats->bytes += pkt->skb->len;
u64_stats_update_end(&stats->syncp);
+ local_bh_enable();
}
- local_bh_enable();
}
struct nft_jumpstack {
- const struct nft_chain *chain;
- struct nft_rule *const *rules;
+ const struct nft_rule_dp *rule;
};
static void expr_call_ops_eval(const struct nft_expr *expr,
struct nft_regs *regs,
struct nft_pktinfo *pkt)
{
- unsigned long e = (unsigned long)expr->ops->eval;
-
- if (e == (unsigned long)nft_meta_get_eval)
- nft_meta_get_eval(expr, regs, pkt);
- else if (e == (unsigned long)nft_lookup_eval)
- nft_lookup_eval(expr, regs, pkt);
- else
- expr->ops->eval(expr, regs, pkt);
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ unsigned long e;
+
+ if (nf_skip_indirect_calls())
+ goto indirect_call;
+
+ e = (unsigned long)expr->ops->eval;
+#define X(e, fun) \
+ do { if ((e) == (unsigned long)(fun)) \
+ return fun(expr, regs, pkt); } while (0)
+
+ X(e, nft_payload_eval);
+ X(e, nft_cmp_eval);
+ X(e, nft_counter_eval);
+ X(e, nft_meta_get_eval);
+ X(e, nft_lookup_eval);
+#if IS_ENABLED(CONFIG_NFT_CT)
+ X(e, nft_ct_get_fast_eval);
+#endif
+ X(e, nft_range_eval);
+ X(e, nft_immediate_eval);
+ X(e, nft_byteorder_eval);
+ X(e, nft_dynset_eval);
+ X(e, nft_rt_get_eval);
+ X(e, nft_bitwise_eval);
+ X(e, nft_objref_eval);
+ X(e, nft_objref_map_eval);
+#undef X
+indirect_call:
+#endif /* CONFIG_MITIGATION_RETPOLINE */
+ expr->ops->eval(expr, regs, pkt);
}
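
With retpolines enabled, every indirect call detours through a return trampoline, so the X() list above turns the hottest expression handlers into a chain of pointer compares plus direct calls, keeping the indirect call only as a fallback. The pattern in isolation, with made-up handler names:

    #include <stdio.h>

    static void handler_foo(int v) { printf("foo %d\n", v); }
    static void handler_bar(int v) { printf("bar %d\n", v); }

    static void dispatch(void (*fn)(int), int v)
    {
    #define X(f) do { if (fn == (f)) { f(v); return; } } while (0)
        X(handler_foo); /* cheap compare + direct call */
        X(handler_bar);
    #undef X
        fn(v);          /* genuine indirect call for the rest */
    }

    int main(void)
    {
        dispatch(handler_foo, 1);
        dispatch(handler_bar, 2);
        return 0;
    }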
+#define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0]
+#define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size
+#define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen]
+
+#define nft_rule_dp_for_each_expr(expr, last, rule) \
+ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
+ (expr) != (last); \
+ (expr) = nft_rule_expr_next(expr))
+
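
These macros walk expressions laid out back to back inside one rule; nft_rule_next(), used by the loop further down, steps across rules the same way, because a chain's rules now live in a single contiguous blob terminated by an is_last marker instead of a NULL-terminated array of rule pointers. A rough, runnable model of that layout (field widths are illustrative; see struct nft_rule_dp for the real one):

    #include <stdio.h>
    #include <stdlib.h>

    struct rule_dp_sketch {
        unsigned long long is_last:1, /* terminator flag checked by the loop */
                           dlen:12,   /* bytes of expression data that follow */
                           handle:51; /* rule handle, kept for tracing */
        unsigned char data[];         /* expressions, packed */
    };

    static const struct rule_dp_sketch *rule_next(const struct rule_dp_sketch *r)
    {
        return (const struct rule_dp_sketch *)(r->data + r->dlen);
    }

    int main(void)
    {
        /* one rule carrying 8 bytes of (fake) expression data + terminator */
        struct rule_dp_sketch *blob = calloc(1, 2 * sizeof(*blob) + 8);
        const struct rule_dp_sketch *r;
        unsigned int n = 0;

        if (!blob)
            return 1;
        blob->dlen = 8;
        ((struct rule_dp_sketch *)(blob->data + 8))->is_last = 1;

        for (r = blob; !r->is_last; r = rule_next(r))
            n++;
        printf("walked %u rule(s)\n", n);
        free(blob);
        return 0;
    }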
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct net *net = nft_net(pkt);
- struct nft_rule *const *rules;
- const struct nft_rule *rule;
const struct nft_expr *expr, *last;
+ const struct nft_rule_dp *rule;
struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
+ struct nft_rule_blob *blob;
struct nft_traceinfo info;
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
- nft_trace_init(&info, pkt, &regs.verdict, basechain);
+ nft_trace_init(&info, pkt, basechain);
do_chain:
if (genbit)
- rules = rcu_dereference(chain->rules_gen_1);
+ blob = rcu_dereference(chain->blob_gen_1);
else
- rules = rcu_dereference(chain->rules_gen_0);
+ blob = rcu_dereference(chain->blob_gen_0);
+ rule = (struct nft_rule_dp *)blob->data;
next_rule:
- rule = *rules;
regs.verdict.code = NFT_CONTINUE;
- for (; *rules ; rules++) {
- rule = *rules;
- nft_rule_for_each_expr(expr, last, rule) {
+ for (; !rule->is_last ; rule = nft_rule_next(rule)) {
+ nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_cmp16_fast_ops)
+ nft_cmp16_fast_eval(expr, &regs);
+ else if (expr->ops == &nft_bitwise_fast_ops)
+ nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
expr_call_ops_eval(expr, &regs, pkt);
@@ -176,61 +291,58 @@ next_rule:
switch (regs.verdict.code) {
case NFT_BREAK:
regs.verdict.code = NFT_CONTINUE;
+ nft_trace_copy_nftrace(pkt, &info);
continue;
case NFT_CONTINUE:
- nft_trace_packet(&info, chain, rule,
+ nft_trace_packet(pkt, &regs.verdict, &info, rule,
NFT_TRACETYPE_RULE);
continue;
}
break;
}
+ nft_trace_verdict(pkt, &info, rule, &regs);
+
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
- case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
return regs.verdict.code;
+ case NF_DROP:
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
}
switch (regs.verdict.code) {
case NFT_JUMP:
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
- jumpstack[stackptr].chain = chain;
- jumpstack[stackptr].rules = rules + 1;
+ jumpstack[stackptr].rule = nft_rule_next(rule);
stackptr++;
- /* fall through */
+ fallthrough;
case NFT_GOTO:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RULE);
-
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE:
- /* fall through */
case NFT_RETURN:
- nft_trace_packet(&info, chain, rule,
- NFT_TRACETYPE_RETURN);
break;
default:
- WARN_ON(1);
+ WARN_ON_ONCE(1);
}
if (stackptr > 0) {
stackptr--;
- chain = jumpstack[stackptr].chain;
- rules = jumpstack[stackptr].rules;
+ rule = jumpstack[stackptr].rule;
goto next_rule;
}
- nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);
+ nft_trace_packet(pkt, &regs.verdict, &info, NULL, NFT_TRACETYPE_POLICY);
if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);
+ if (nft_base_chain(basechain)->policy == NF_DROP)
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM);
+
return nft_base_chain(basechain)->policy;
}
EXPORT_SYMBOL_GPL(nft_do_chain);
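
Note the asymmetry in the verdict handling above: NFT_JUMP pushes the rule after the jumping one before transferring control, while NFT_GOTO transfers without pushing, so a later NFT_RETURN pops the frame of the most recent jump, and an empty stack falls through to the base chain policy. A toy model of those semantics (invented types, not kernel code):

    #include <stdio.h>

    enum verdict { V_END, V_JUMP, V_GOTO, V_RETURN };

    struct op { enum verdict v; const struct op *target; };

    static void run(const struct op *pc)
    {
        const struct op *stack[16];
        int sp = 0;

        for (;;) {
            switch (pc->v) {
            case V_JUMP:
                stack[sp++] = pc + 1; /* resume after the jump */
                pc = pc->target;
                continue;
            case V_GOTO:
                pc = pc->target;      /* no frame pushed */
                continue;
            case V_RETURN:
            case V_END:
                if (sp == 0) {
                    puts("base chain policy applies");
                    return;
                }
                pc = stack[--sp];
                continue;
            }
        }
    }

    int main(void)
    {
        static const struct op leaf[] = { { V_RETURN, NULL } };
        static const struct op mid[]  = { { V_GOTO, leaf }, { V_END, NULL } };
        static const struct op base[] = { { V_JUMP, mid }, { V_END, NULL } };

        /* RETURN in leaf pops the JUMP frame, resuming after base[0] */
        run(base);
        return 0;
    }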
@@ -247,18 +359,25 @@ static struct nft_expr_type *nft_basic_types[] = {
&nft_meta_type,
&nft_rt_type,
&nft_exthdr_type,
+ &nft_last_type,
+ &nft_counter_type,
+ &nft_objref_type,
+ &nft_inner_type,
};
static struct nft_object_type *nft_basic_objects[] = {
#ifdef CONFIG_NETWORK_SECMARK
&nft_secmark_obj_type,
#endif
+ &nft_counter_obj_type,
};
int __init nf_tables_core_module_init(void)
{
int err, i, j = 0;
+ nft_counter_init_seqcount();
+
for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) {
err = nft_register_obj(nft_basic_objects[i]);
if (err)
@@ -271,6 +390,8 @@ int __init nf_tables_core_module_init(void)
goto err;
}
+ nf_skip_indirect_calls_enable();
+
return 0;
err:
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
new file mode 100644
index 000000000000..fd30e205de84
--- /dev/null
+++ b/net/netfilter/nf_tables_offload.c
@@ -0,0 +1,699 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <net/pkt_cls.h>
+
+static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
+{
+ struct nft_flow_rule *flow;
+
+ flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL);
+ if (!flow)
+ return NULL;
+
+ flow->rule = flow_rule_alloc(num_actions);
+ if (!flow->rule) {
+ kfree(flow);
+ return NULL;
+ }
+
+ flow->rule->match.dissector = &flow->match.dissector;
+ flow->rule->match.mask = &flow->match.mask;
+ flow->rule->match.key = &flow->match.key;
+
+ return flow;
+}
+
+void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
+ enum flow_dissector_key_id addr_type)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_flow_key *mask = &match->mask;
+ struct nft_flow_key *key = &match->key;
+
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))
+ return;
+
+ key->control.addr_type = addr_type;
+ mask->control.addr_type = 0xffff;
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL);
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
+ offsetof(struct nft_flow_key, control);
+}
+
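
Populating a key here follows the same three steps used throughout this file: write the value into key and the significant bits into mask, flag the key id in used_keys, and record the key's offset inside the key blob so the dissector can locate it. The pattern reduced to simplified stand-in types:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { KEY_CONTROL, KEY_MAX };

    struct ctrl { uint16_t addr_type; };
    struct key_blob { struct ctrl control; /* further keys follow */ };

    struct match {
        struct key_blob key, mask;
        uint64_t used_keys;
        size_t offset[KEY_MAX];
    };

    static void set_addr_type(struct match *m, uint16_t addr_type)
    {
        if (m->used_keys & (1ULL << KEY_CONTROL))
            return; /* already populated by an earlier expression */

        m->key.control.addr_type = addr_type;
        m->mask.control.addr_type = 0xffff;
        m->used_keys |= 1ULL << KEY_CONTROL;
        m->offset[KEY_CONTROL] = offsetof(struct key_blob, control);
    }

    int main(void)
    {
        struct match m = { 0 };

        set_addr_type(&m, 4 /* stand-in for an IPv4 addr_type */);
        printf("used_keys=%#llx control@%zu\n",
               (unsigned long long)m.used_keys, m.offset[KEY_CONTROL]);
        return 0;
    }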
+struct nft_offload_ethertype {
+ __be16 value;
+ __be16 mask;
+};
+
+static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow)
+{
+ struct nft_flow_match *match = &flow->match;
+ struct nft_offload_ethertype ethertype = {
+ .value = match->key.basic.n_proto,
+ .mask = match->mask.basic.n_proto,
+ };
+
+ if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) &&
+ (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) ||
+ match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.cvlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid;
+ match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid;
+ match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] =
+ offsetof(struct nft_flow_key, cvlan);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN);
+ } else if (match->dissector.used_keys &
+ BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) &&
+ (match->key.basic.n_proto == htons(ETH_P_8021Q) ||
+ match->key.basic.n_proto == htons(ETH_P_8021AD))) {
+ match->key.basic.n_proto = match->key.vlan.vlan_tpid;
+ match->mask.basic.n_proto = match->mask.vlan.vlan_tpid;
+ match->key.vlan.vlan_tpid = ethertype.value;
+ match->mask.vlan.vlan_tpid = ethertype.mask;
+ match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
+ offsetof(struct nft_flow_key, vlan);
+ match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
+ }
+}
+
+struct nft_flow_rule *nft_flow_rule_create(struct net *net,
+ const struct nft_rule *rule)
+{
+ struct nft_offload_ctx *ctx;
+ struct nft_flow_rule *flow;
+ int num_actions = 0, err;
+ struct nft_expr *expr;
+
+ expr = nft_expr_first(rule);
+ while (nft_expr_more(rule, expr)) {
+ if (expr->ops->offload_action &&
+ expr->ops->offload_action(expr))
+ num_actions++;
+
+ expr = nft_expr_next(expr);
+ }
+
+ if (num_actions == 0)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ flow = nft_flow_rule_alloc(num_actions);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ expr = nft_expr_first(rule);
+
+ ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ ctx->net = net;
+ ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
+
+ while (nft_expr_more(rule, expr)) {
+ if (!expr->ops->offload) {
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ err = expr->ops->offload(ctx, flow, expr);
+ if (err < 0)
+ goto err_out;
+
+ expr = nft_expr_next(expr);
+ }
+ nft_flow_rule_transfer_vlan(ctx, flow);
+
+ flow->proto = ctx->dep.l3num;
+ kfree(ctx);
+
+ return flow;
+err_out:
+ kfree(ctx);
+ nft_flow_rule_destroy(flow);
+
+ return ERR_PTR(err);
+}
+
+void nft_flow_rule_destroy(struct nft_flow_rule *flow)
+{
+ struct flow_action_entry *entry;
+ int i;
+
+ flow_action_for_each(i, entry, &flow->rule->action) {
+ switch (entry->id) {
+ case FLOW_ACTION_REDIRECT:
+ case FLOW_ACTION_MIRRED:
+ dev_put(entry->dev);
+ break;
+ default:
+ break;
+ }
+ }
+ kfree(flow->rule);
+ kfree(flow);
+}
+
+void nft_offload_set_dependency(struct nft_offload_ctx *ctx,
+ enum nft_offload_dep_type type)
+{
+ ctx->dep.type = type;
+}
+
+void nft_offload_update_dependency(struct nft_offload_ctx *ctx,
+ const void *data, u32 len)
+{
+ switch (ctx->dep.type) {
+ case NFT_OFFLOAD_DEP_NETWORK:
+ WARN_ON(len != sizeof(__u16));
+ memcpy(&ctx->dep.l3num, data, sizeof(__u16));
+ break;
+ case NFT_OFFLOAD_DEP_TRANSPORT:
+ WARN_ON(len != sizeof(__u8));
+ memcpy(&ctx->dep.protonum, data, sizeof(__u8));
+ break;
+ default:
+ break;
+ }
+ ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
+}
+
+static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
+ __be16 proto, int priority,
+ struct netlink_ext_ack *extack)
+{
+ common->protocol = proto;
+ common->prio = priority;
+ common->extack = extack;
+}
+
+static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
+ struct list_head *cb_list)
+{
+ struct flow_block_cb *block_cb;
+ int err;
+
+ list_for_each_entry(block_cb, cb_list, list) {
+ err = block_cb->cb(type, type_data, block_cb->cb_priv);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
+{
+ if (basechain->ops.priority <= 0 ||
+ basechain->ops.priority > USHRT_MAX)
+ return -1;
+
+ return 0;
+}
+
+bool nft_chain_offload_support(const struct nft_base_chain *basechain)
+{
+ struct nf_hook_ops *ops;
+ struct net_device *dev;
+ struct nft_hook *hook;
+
+ if (nft_chain_offload_priority(basechain) < 0)
+ return false;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (ops->pf != NFPROTO_NETDEV ||
+ ops->hooknum != NF_NETDEV_INGRESS)
+ return false;
+
+ dev = ops->dev;
+ if (!dev->netdev_ops->ndo_setup_tc &&
+ !flow_indr_dev_exists())
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
+ const struct nft_base_chain *basechain,
+ const struct nft_rule *rule,
+ const struct nft_flow_rule *flow,
+ struct netlink_ext_ack *extack,
+ enum flow_cls_command command)
+{
+ __be16 proto = ETH_P_ALL;
+
+ memset(cls_flow, 0, sizeof(*cls_flow));
+
+ if (flow)
+ proto = flow->proto;
+
+ nft_flow_offload_common_init(&cls_flow->common, proto,
+ basechain->ops.priority, extack);
+ cls_flow->command = command;
+ cls_flow->cookie = (unsigned long) rule;
+ if (flow)
+ cls_flow->rule = flow->rule;
+}
+
+static int nft_flow_offload_cmd(const struct nft_chain *chain,
+ const struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command,
+ struct flow_cls_offload *cls_flow)
+{
+ struct netlink_ext_ack extack = {};
+ struct nft_base_chain *basechain;
+
+ if (!nft_is_base_chain(chain))
+ return -EOPNOTSUPP;
+
+ basechain = nft_base_chain(chain);
+ nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack,
+ command);
+
+ return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow,
+ &basechain->flow_block.cb_list);
+}
+
+static int nft_flow_offload_rule(const struct nft_chain *chain,
+ struct nft_rule *rule,
+ struct nft_flow_rule *flow,
+ enum flow_cls_command command)
+{
+ struct flow_cls_offload cls_flow;
+
+ return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow);
+}
+
+int nft_flow_rule_stats(const struct nft_chain *chain,
+ const struct nft_rule *rule)
+{
+ struct flow_cls_offload cls_flow = {};
+ struct nft_expr *expr, *next;
+ int err;
+
+ err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS,
+ &cls_flow);
+ if (err < 0)
+ return err;
+
+ nft_rule_for_each_expr(expr, next, rule) {
+ if (expr->ops->offload_stats)
+ expr->ops->offload_stats(expr, &cls_flow.stats);
+ }
+
+ return 0;
+}
+
+static int nft_flow_offload_bind(struct flow_block_offload *bo,
+ struct nft_base_chain *basechain)
+{
+ list_splice(&bo->cb_list, &basechain->flow_block.cb_list);
+ return 0;
+}
+
+static int nft_flow_offload_unbind(struct flow_block_offload *bo,
+ struct nft_base_chain *basechain)
+{
+ struct flow_block_cb *block_cb, *next;
+ struct flow_cls_offload cls_flow;
+ struct netlink_ext_ack extack;
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
+ chain = &basechain->chain;
+ list_for_each_entry(rule, &chain->rules, list) {
+ memset(&extack, 0, sizeof(extack));
+ nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL,
+ &extack, FLOW_CLS_DESTROY);
+ nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list);
+ }
+
+ list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+ list_del(&block_cb->list);
+ flow_block_cb_free(block_cb);
+ }
+
+ return 0;
+}
+
+static int nft_block_setup(struct nft_base_chain *basechain,
+ struct flow_block_offload *bo,
+ enum flow_block_command cmd)
+{
+ int err;
+
+ switch (cmd) {
+ case FLOW_BLOCK_BIND:
+ err = nft_flow_offload_bind(bo, basechain);
+ break;
+ case FLOW_BLOCK_UNBIND:
+ err = nft_flow_offload_unbind(bo, basechain);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static void nft_flow_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nft_base_chain *basechain,
+ struct netlink_ext_ack *extack)
+{
+ memset(bo, 0, sizeof(*bo));
+ bo->net = net;
+ bo->block = &basechain->flow_block;
+ bo->command = cmd;
+ bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+ bo->extack = extack;
+ bo->cb_list_head = &basechain->flow_block.cb_list;
+ INIT_LIST_HEAD(&bo->cb_list);
+}
+
+static int nft_block_offload_cmd(struct nft_base_chain *chain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+ int err;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+ if (err < 0)
+ return err;
+
+ return nft_block_setup(chain, &bo, cmd);
+}
+
+static void nft_indr_block_cleanup(struct flow_block_cb *block_cb)
+{
+ struct nft_base_chain *basechain = block_cb->indr.data;
+ struct net_device *dev = block_cb->indr.dev;
+ struct netlink_ext_ack extack = {};
+ struct nftables_pernet *nft_net;
+ struct net *net = dev_net(dev);
+ struct flow_block_offload bo;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND,
+ basechain, &extack);
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ list_del(&block_cb->driver_list);
+ list_move(&block_cb->list, &bo.cb_list);
+ nft_flow_offload_unbind(&bo, basechain);
+ mutex_unlock(&nft_net->commit_mutex);
+}
+
+static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
+ int err;
+
+ nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack);
+
+ err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo,
+ nft_indr_block_cleanup);
+ if (err < 0)
+ return err;
+
+ if (list_empty(&bo.cb_list))
+ return -EOPNOTSUPP;
+
+ return nft_block_setup(basechain, &bo, cmd);
+}
+
+static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
+ struct net_device *dev,
+ enum flow_block_command cmd)
+{
+ int err;
+
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nft_block_offload_cmd(basechain, dev, cmd);
+ else
+ err = nft_indr_block_offload_cmd(basechain, dev, cmd);
+
+ return err;
+}
+
+static int nft_flow_block_chain(struct nft_base_chain *basechain,
+ const struct net_device *this_dev,
+ enum flow_block_command cmd)
+{
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ int err, i = 0;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (this_dev && this_dev != ops->dev)
+ continue;
+
+ err = nft_chain_offload_cmd(basechain, ops->dev, cmd);
+ if (err < 0 && cmd == FLOW_BLOCK_BIND) {
+ if (!this_dev)
+ goto err_flow_block;
+
+ return err;
+ }
+ i++;
+ }
+ }
+
+ return 0;
+
+err_flow_block:
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ list_for_each_entry(ops, &hook->ops_list, list) {
+ if (i-- <= 0)
+ break;
+
+ nft_chain_offload_cmd(basechain, ops->dev,
+ FLOW_BLOCK_UNBIND);
+ }
+ }
+ return err;
+}
+
+static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
+ enum flow_block_command cmd)
+{
+ struct nft_base_chain *basechain;
+ u8 policy;
+
+ if (!nft_is_base_chain(chain))
+ return -EOPNOTSUPP;
+
+ basechain = nft_base_chain(chain);
+ policy = ppolicy ? *ppolicy : basechain->policy;
+
+	/* Only a default policy of accept is supported for now. */
+ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP)
+ return -EOPNOTSUPP;
+
+ return nft_flow_block_chain(basechain, NULL, cmd);
+}
+
+static void nft_flow_rule_offload_abort(struct net *net,
+ struct nft_trans *trans)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ int err = 0;
+
+ list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) {
+ if (trans->table->family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ nft_trans_chain_update(trans))
+ continue;
+
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
+ FLOW_BLOCK_UNBIND);
+ break;
+ case NFT_MSG_DELCHAIN:
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
+ FLOW_BLOCK_BIND);
+ break;
+ case NFT_MSG_NEWRULE:
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
+ nft_trans_rule(trans),
+ NULL, FLOW_CLS_DESTROY);
+ break;
+ case NFT_MSG_DELRULE:
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
+ nft_trans_rule(trans),
+ nft_trans_flow_rule(trans),
+ FLOW_CLS_REPLACE);
+ break;
+ }
+
+ if (WARN_ON_ONCE(err))
+ break;
+ }
+}
+
+int nft_flow_rule_offload_commit(struct net *net)
+{
+ struct nftables_pernet *nft_net = nft_pernet(net);
+ struct nft_trans *trans;
+ int err = 0;
+ u8 policy;
+
+ list_for_each_entry(trans, &nft_net->commit_list, list) {
+ if (trans->table->family != NFPROTO_NETDEV)
+ continue;
+
+ switch (trans->msg_type) {
+ case NFT_MSG_NEWCHAIN:
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
+ nft_trans_chain_update(trans))
+ continue;
+
+ policy = nft_trans_chain_policy(trans);
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
+ FLOW_BLOCK_BIND);
+ break;
+ case NFT_MSG_DELCHAIN:
+ if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ policy = nft_trans_chain_policy(trans);
+ err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
+ FLOW_BLOCK_UNBIND);
+ break;
+ case NFT_MSG_NEWRULE:
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ if (trans->flags & NLM_F_REPLACE ||
+ !(trans->flags & NLM_F_APPEND)) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
+ nft_trans_rule(trans),
+ nft_trans_flow_rule(trans),
+ FLOW_CLS_REPLACE);
+ break;
+ case NFT_MSG_DELRULE:
+ if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
+ nft_trans_rule(trans),
+ NULL, FLOW_CLS_DESTROY);
+ break;
+ }
+
+ if (err) {
+ nft_flow_rule_offload_abort(net, trans);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net,
+ struct net_device *dev)
+{
+ struct nft_base_chain *basechain;
+ struct nft_hook *hook, *found;
+ const struct nft_table *table;
+ struct nft_chain *chain;
+
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (table->family != NFPROTO_NETDEV)
+ continue;
+
+ list_for_each_entry(chain, &table->chains, list) {
+ if (!nft_is_base_chain(chain) ||
+ !(chain->flags & NFT_CHAIN_HW_OFFLOAD))
+ continue;
+
+ found = NULL;
+ basechain = nft_base_chain(chain);
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ if (!nft_hook_find_ops(hook, dev))
+ continue;
+
+ found = hook;
+ break;
+ }
+ if (!found)
+ continue;
+
+ return chain;
+ }
+ }
+
+ return NULL;
+}
+
+static int nft_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
+ struct net *net = dev_net(dev);
+ struct nft_chain *chain;
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(net);
+ mutex_lock(&nft_net->commit_mutex);
+ chain = __nft_offload_get_chain(nft_net, dev);
+ if (chain)
+ nft_flow_block_chain(nft_base_chain(chain), dev,
+ FLOW_BLOCK_UNBIND);
+
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nft_offload_netdev_notifier = {
+ .notifier_call = nft_offload_netdev_event,
+};
+
+int nft_offload_init(void)
+{
+ return register_netdevice_notifier(&nft_offload_netdev_notifier);
+}
+
+void nft_offload_exit(void)
+{
+ unregister_netdevice_notifier(&nft_offload_netdev_notifier);
+}
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
deleted file mode 100644
index 814789644bd3..000000000000
--- a/net/netfilter/nf_tables_set_core.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <net/netfilter/nf_tables_core.h>
-
-static int __init nf_tables_set_module_init(void)
-{
- nft_register_set(&nft_set_hash_fast_type);
- nft_register_set(&nft_set_hash_type);
- nft_register_set(&nft_set_rhash_type);
- nft_register_set(&nft_set_bitmap_type);
- nft_register_set(&nft_set_rbtree_type);
-
- return 0;
-}
-
-static void __exit nf_tables_set_module_exit(void)
-{
- nft_unregister_set(&nft_set_rbtree_type);
- nft_unregister_set(&nft_set_bitmap_type);
- nft_unregister_set(&nft_set_rhash_type);
- nft_unregister_set(&nft_set_hash_type);
- nft_unregister_set(&nft_set_hash_fast_type);
-}
-
-module_init(nf_tables_set_module_init);
-module_exit(nf_tables_set_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index e1dc527a493b..a88abae5a9de 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -1,16 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2015 Red Hat GmbH
* Author: Florian Westphal <fw@strlen.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/static_key.h>
#include <linux/hash.h>
-#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/skbuff.h>
@@ -18,6 +15,7 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
@@ -28,22 +26,6 @@
DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
EXPORT_SYMBOL_GPL(nft_trace_enabled);
-static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
-{
- __be32 id;
-
- /* using skb address as ID results in a limited number of
- * values (and quick reuse).
- *
- * So we attempt to use as many skb members that will not
- * change while skb is with netfilter.
- */
- id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
- skb->skb_iif);
-
- return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
-}
-
static int trace_fill_header(struct sk_buff *nlskb, u16 type,
const struct sk_buff *skb,
int off, unsigned int len)
@@ -109,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
return 0;
}
+static int nf_trace_fill_ct_info(struct sk_buff *nlskb,
+ const struct sk_buff *skb)
+{
+ const struct nf_ct_hook *ct_hook;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ u32 state;
+
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook)
+ return 0;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct) {
+ if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */
+ return 0;
+
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ } else {
+ state = NF_CT_STATE_BIT(ctinfo);
+ }
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state)))
+ return -1;
+
+ if (ct) {
+ u32 id = ct_hook->get_id(&ct->ct_general);
+ u32 status = READ_ONCE(ct->status);
+ u8 dir = CTINFO2DIR(ctinfo);
+
+ if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir))
+ return -1;
+
+ if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id))
+ return -1;
+
+ /* Kernel implementation detail, withhold this from userspace for now */
+ status &= ~IPS_NAT_CLASH;
+
+ if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status)))
+ return -1;
+ }
+
+ return 0;
+}
+
static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
const struct nft_pktinfo *pkt)
{
@@ -116,17 +144,17 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
int off = skb_network_offset(skb);
unsigned int len, nh_end;
- nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len;
+ nh_end = pkt->flags & NFT_PKTINFO_L4PROTO ? nft_thoff(pkt) : skb->len;
len = min_t(unsigned int, nh_end - skb_network_offset(skb),
NFT_TRACETYPE_NETWORK_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
return -1;
- if (pkt->tprot_set) {
- len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ if (pkt->flags & NFT_PKTINFO_L4PROTO) {
+ len = min_t(unsigned int, skb->len - nft_thoff(pkt),
NFT_TRACETYPE_TRANSPORT_HSIZE);
if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
- pkt->xt.thoff, len))
+ nft_thoff(pkt), len))
return -1;
}
@@ -143,9 +171,11 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
}
static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
const struct nft_traceinfo *info)
{
- if (!info->rule)
+ if (!rule || rule->is_last)
return 0;
/* a continue verdict with ->type == RETURN means that this is
@@ -154,15 +184,16 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
* Since no rule matched, the ->rule pointer is invalid.
*/
if (info->type == NFT_TRACETYPE_RETURN &&
- info->verdict->code == NFT_CONTINUE)
+ verdict->code == NFT_CONTINUE)
return 0;
return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
- cpu_to_be64(info->rule->handle),
+ cpu_to_be64(rule->handle),
NFTA_TRACE_PAD);
}
-static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
+ struct nft_traceinfo *info)
{
switch (info->type) {
case NFT_TRACETYPE_RETURN:
@@ -172,7 +203,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return false;
}
- switch (info->verdict->code) {
+ switch (verdict->code) {
case NFT_JUMP:
case NFT_GOTO:
break;
@@ -183,26 +214,54 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
return true;
}
-void nft_trace_notify(struct nft_traceinfo *info)
+static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule,
+ const struct nft_traceinfo *info)
{
- const struct nft_pktinfo *pkt = info->pkt;
- struct nfgenmsg *nfmsg;
+ const struct nft_rule_dp_last *last;
+
+ if (!rule)
+ return &info->basechain->chain;
+
+ while (!rule->is_last)
+ rule = nft_rule_next(rule);
+
+ last = (const struct nft_rule_dp_last *)rule;
+
+ if (WARN_ON_ONCE(!last->chain))
+ return &info->basechain->chain;
+
+ return last->chain;
+}
+
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_rule_dp *rule,
+ struct nft_traceinfo *info)
+{
+ const struct nft_chain *chain;
struct nlmsghdr *nlh;
struct sk_buff *skb;
unsigned int size;
+ u32 mark = 0;
u16 event;
if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
return;
+ chain = nft_trace_get_chain(rule, info);
+
size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
- nla_total_size(strlen(info->chain->table->name)) +
- nla_total_size(strlen(info->chain->name)) +
+ nla_total_size(strlen(chain->table->name)) +
+ nla_total_size(strlen(chain->name)) +
nla_total_size_64bit(sizeof(__be64)) + /* rule handle */
nla_total_size(sizeof(__be32)) + /* trace type */
nla_total_size(0) + /* VERDICT, nested */
nla_total_size(sizeof(u32)) + /* verdict code */
- nla_total_size(sizeof(u32)) + /* id */
+ nla_total_size(sizeof(u32)) + /* ct id */
+ nla_total_size(sizeof(u8)) + /* ct direction */
+ nla_total_size(sizeof(u32)) + /* ct state */
+ nla_total_size(sizeof(u32)) + /* ct status */
+ nla_total_size(sizeof(u32)) + /* trace id */
nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
@@ -214,39 +273,35 @@ void nft_trace_notify(struct nft_traceinfo *info)
nla_total_size(sizeof(u32)) + /* nfproto */
nla_total_size(sizeof(u32)); /* policy */
- if (nft_trace_have_verdict_chain(info))
- size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+ if (nft_trace_have_verdict_chain(verdict, info))
+ size += nla_total_size(strlen(verdict->chain->name)); /* jump target */
skb = nlmsg_new(size, GFP_ATOMIC);
if (!skb)
return;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE);
- nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family,
+ NFNETLINK_V0, 0);
if (!nlh)
goto nla_put_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = info->basechain->type->family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt))))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
goto nla_put_failure;
- if (trace_fill_id(skb, pkt->skb))
+ if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
+ if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name))
goto nla_put_failure;
- if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name))
+ if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
goto nla_put_failure;
- if (nf_trace_fill_rule_info(skb, info))
+ if (nf_trace_fill_rule_info(skb, verdict, rule, info))
goto nla_put_failure;
switch (info->type) {
@@ -254,19 +309,31 @@ void nft_trace_notify(struct nft_traceinfo *info)
case __NFT_TRACETYPE_MAX:
break;
case NFT_TRACETYPE_RETURN:
- case NFT_TRACETYPE_RULE:
- if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+ case NFT_TRACETYPE_RULE: {
+ unsigned int v;
+
+ if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict))
goto nla_put_failure;
+
+ /* pkt->skb undefined iff NF_STOLEN, disable dump */
+ v = verdict->code & NF_VERDICT_MASK;
+ if (v == NF_STOLEN)
+ info->packet_dumped = true;
+ else
+ mark = pkt->skb->mark;
+
break;
+ }
case NFT_TRACETYPE_POLICY:
+ mark = pkt->skb->mark;
+
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
- if (pkt->skb->mark &&
- nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark)))
goto nla_put_failure;
if (!info->packet_dumped) {
@@ -275,6 +342,10 @@ void nft_trace_notify(struct nft_traceinfo *info)
if (nf_trace_fill_pkt_info(skb, pkt))
goto nla_put_failure;
+
+ if (nf_trace_fill_ct_info(skb, pkt->skb))
+ goto nla_put_failure;
+
info->packet_dumped = true;
}
@@ -288,12 +359,20 @@ void nft_trace_notify(struct nft_traceinfo *info)
}
void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
- const struct nft_verdict *verdict,
const struct nft_chain *chain)
{
+ static siphash_key_t trace_key __read_mostly;
+ struct sk_buff *skb = pkt->skb;
+
info->basechain = nft_base_chain(chain);
info->trace = true;
+ info->nf_trace = pkt->skb->nf_trace;
info->packet_dumped = false;
- info->pkt = pkt;
- info->verdict = verdict;
+
+ net_get_random_once(&trace_key, sizeof(trace_key));
+
+ info->skbid = (u32)siphash_3u32(hash32_ptr(skb),
+ skb_get_hash_net(nft_net(pkt), skb),
+ skb->skb_iif,
+ &trace_key);
}
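
Computing the id once at trace setup time, keyed with a per-boot random siphash key, replaces the old per-message jhash: the id stays stable across every trace message emitted for one packet traversal, yet is not predictable or reproducible by an observer. A userspace sketch of the scheme (the mixing function is a stand-in, not real siphash):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    static uint64_t key;

    static void key_init_once(void)
    {
        if (!key) { /* stands in for net_get_random_once() */
            srand((unsigned)time(NULL));
            key = ((uint64_t)rand() << 32) | (uint64_t)rand();
            key |= 1; /* keep the "already initialized" test trivial */
        }
    }

    static uint32_t trace_id(uint32_t ptr_hash, uint32_t flow_hash, uint32_t iif)
    {
        uint64_t h;

        key_init_once();
        h = key;
        h ^= ptr_hash;  h *= 0x9e3779b97f4a7c15ULL;
        h ^= flow_hash; h *= 0x9e3779b97f4a7c15ULL;
        h ^= iif;
        return (uint32_t)(h ^ (h >> 32));
    }

    int main(void)
    {
        /* identical inputs -> identical id for one packet's traversal */
        printf("%#x\n", trace_id(0xdeadbeef, 0x1234, 2));
        printf("%#x\n", trace_id(0xdeadbeef, 0x1234, 2));
        return 0;
    }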
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 916913454624..811d02b4c4f7 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -28,11 +28,14 @@
#include <linux/sched/signal.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
+#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+MODULE_DESCRIPTION("Netfilter messages via netlink socket");
#define nfnl_dereference_protected(id) \
rcu_dereference_protected(table[(id)].subsys, \
@@ -40,11 +43,39 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
#define NFNL_MAX_ATTR_COUNT 32
+static unsigned int nfnetlink_pernet_id __read_mostly;
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static DEFINE_SPINLOCK(nfnl_grp_active_lock);
+#endif
+
+struct nfnl_net {
+ struct sock *nfnl;
+};
+
static struct {
struct mutex mutex;
const struct nfnetlink_subsystem __rcu *subsys;
} table[NFNL_SUBSYS_COUNT];
+static struct lock_class_key nfnl_lockdep_keys[NFNL_SUBSYS_COUNT];
+
+static const char *const nfnl_lockdep_names[NFNL_SUBSYS_COUNT] = {
+ [NFNL_SUBSYS_NONE] = "nfnl_subsys_none",
+ [NFNL_SUBSYS_CTNETLINK] = "nfnl_subsys_ctnetlink",
+ [NFNL_SUBSYS_CTNETLINK_EXP] = "nfnl_subsys_ctnetlink_exp",
+ [NFNL_SUBSYS_QUEUE] = "nfnl_subsys_queue",
+ [NFNL_SUBSYS_ULOG] = "nfnl_subsys_ulog",
+ [NFNL_SUBSYS_OSF] = "nfnl_subsys_osf",
+ [NFNL_SUBSYS_IPSET] = "nfnl_subsys_ipset",
+ [NFNL_SUBSYS_ACCT] = "nfnl_subsys_acct",
+ [NFNL_SUBSYS_CTNETLINK_TIMEOUT] = "nfnl_subsys_cttimeout",
+ [NFNL_SUBSYS_CTHELPER] = "nfnl_subsys_cthelper",
+ [NFNL_SUBSYS_NFTABLES] = "nfnl_subsys_nftables",
+ [NFNL_SUBSYS_NFT_COMPAT] = "nfnl_subsys_nftcompat",
+ [NFNL_SUBSYS_HOOK] = "nfnl_subsys_hook",
+};
+
static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_NEW] = NFNL_SUBSYS_CTNETLINK,
[NFNLGRP_CONNTRACK_UPDATE] = NFNL_SUBSYS_CTNETLINK,
@@ -57,6 +88,11 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
};
+static struct nfnl_net *nfnl_pernet(struct net *net)
+{
+ return net_generic(net, nfnetlink_pernet_id);
+}
+
void nfnl_lock(__u8 subsys_id)
{
mutex_lock(&table[subsys_id].mutex);
@@ -131,30 +167,51 @@ nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
int nfnetlink_has_listeners(struct net *net, unsigned int group)
{
- return netlink_has_listeners(net->nfnl, group);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_has_listeners(nfnlnet->nfnl, group);
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
- return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- return netlink_set_err(net->nfnl, portid, group, error);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ return netlink_set_err(nfnlnet->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
- int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
{
- return netlink_unicast(net->nfnl, skb, portid, flags);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+ int err;
+
+ err = nlmsg_unicast(nfnlnet->nfnl, skb, portid);
+ if (err == -EAGAIN)
+ err = -ENOBUFS;
+
+ return err;
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
+void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid,
+ __u32 group, gfp_t allocation)
+{
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_broadcast);
+
/* Process one complete nfnetlink message. */
static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
@@ -171,6 +228,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
type = nlh->nlmsg_type;
replay:
rcu_read_lock();
+
ss = nfnetlink_get_subsys(type);
if (!ss) {
#ifdef CONFIG_MODULES
@@ -194,11 +252,19 @@ replay:
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
int attrlen = nlh->nlmsg_len - min_len;
__u8 subsys_id = NFNL_SUBSYS_ID(type);
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = extack,
+ };
/* Sanity-check NFNL_MAX_ATTR_COUNT */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -206,31 +272,40 @@ replay:
return -ENOMEM;
}
- err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen,
- ss->cb[cb_id].policy, extack);
+ err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count,
+ attr, attrlen,
+ ss->cb[cb_id].policy, extack);
if (err < 0) {
rcu_read_unlock();
return err;
}
- if (nc->call_rcu) {
- err = nc->call_rcu(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
+ if (!nc->call) {
rcu_read_unlock();
- } else {
+ return -EINVAL;
+ }
+
+ switch (nc->type) {
+ case NFNL_CB_RCU:
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
+ rcu_read_unlock();
+ break;
+ case NFNL_CB_MUTEX:
rcu_read_unlock();
nfnl_lock(subsys_id);
if (nfnl_dereference_protected(subsys_id) != ss ||
- nfnetlink_find_client(type, ss) != nc)
+ nfnetlink_find_client(type, ss) != nc) {
+ nfnl_unlock(subsys_id);
err = -EAGAIN;
- else if (nc->call)
- err = nc->call(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- extack);
- else
- err = -EINVAL;
+ break;
+ }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
nfnl_unlock(subsys_id);
+ break;
+ default:
+ rcu_read_unlock();
+ err = -EINVAL;
+ break;
}
if (err == -EAGAIN)
goto replay;
@@ -301,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
struct netlink_ext_ack extack;
+ struct nlmsghdr *onlh = nlh;
LIST_HEAD(err_list);
u32 status;
int err;
@@ -309,8 +385,9 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
return netlink_ack(skb, nlh, -EINVAL, NULL);
replay:
status = 0;
-
+replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ nlh = onlh;
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
@@ -327,31 +404,36 @@ replay:
{
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
}
if (!ss->valid_genid || !ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!try_module_get(ss->owner)) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
if (!ss->valid_genid(net, genid)) {
module_put(ss->owner);
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
- return kfree_skb(skb);
+ return consume_skb(skb);
}
nfnl_unlock(subsys_id);
+ if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
+ }
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -408,12 +490,25 @@ replay:
goto ack;
}
+ if (nc->type != NFNL_CB_BATCH) {
+ err = -EINVAL;
+ goto ack;
+ }
+
{
int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
- u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1];
struct nlattr *attr = (void *)nlh + min_len;
+ u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
int attrlen = nlh->nlmsg_len - min_len;
+ struct nfnl_info info = {
+ .net = net,
+ .sk = nfnlnet->nfnl,
+ .nlh = nlh,
+ .nfmsg = nlmsg_data(nlh),
+ .extack = &extack,
+ };
			/* Sanity-check NFNL_MAX_ATTR_COUNT */
if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) {
@@ -421,16 +516,14 @@ replay:
goto ack;
}
- err = nla_parse(cda, ss->cb[cb_id].attr_count, attr,
- attrlen, ss->cb[cb_id].policy, NULL);
+ err = nla_parse_deprecated(cda,
+ ss->cb[cb_id].attr_count,
+ attr, attrlen,
+ ss->cb[cb_id].policy, &extack);
if (err < 0)
goto ack;
- if (nc->call_batch) {
- err = nc->call_batch(net, net->nfnl, skb, nlh,
- (const struct nlattr **)cda,
- &extack);
- }
+ err = nc->call(skb, &info, (const struct nlattr **)cda);
/* The lock was released to autoload some module, we
* have to abort and start from scratch using the
@@ -447,7 +540,8 @@ ack:
* processed, this avoids that the same error is
* reported several times when replaying the batch.
*/
- if (nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
+ if (err == -ENOMEM ||
+ nfnl_err_add(&err_list, nlh, err, &extack) < 0) {
/* We failed to enqueue an error, reset the
* list of errors and send OOM to userspace
* pointing to the batch header.
@@ -473,9 +567,9 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(net, oskb);
+ ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD);
nfnl_err_reset(&err_list);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
@@ -484,17 +578,32 @@ done:
status |= NFNL_BATCH_REPLAY;
goto done;
} else if (err) {
- ss->abort(net, oskb);
+ ss->abort(net, oskb, NFNL_ABORT_NONE);
netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
+ } else if (nlh->nlmsg_flags & NLM_F_ACK) {
+ memset(&extack, 0, sizeof(extack));
+ nfnl_err_add(&err_list, nlh, 0, &extack);
}
} else {
- ss->abort(net, oskb);
+ enum nfnl_abort_action abort_action;
+
+ if (status & NFNL_BATCH_FAILURE)
+ abort_action = NFNL_ABORT_NONE;
+ else
+ abort_action = NFNL_ABORT_VALIDATE;
+
+ err = ss->abort(net, oskb, abort_action);
+ if (err == -EAGAIN) {
+ nfnl_err_reset(&err_list);
+ consume_skb(skb);
+ module_put(ss->owner);
+ status |= NFNL_BATCH_FAILURE;
+ goto replay_abort;
+ }
}
- if (ss->cleanup)
- ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
- kfree_skb(skb);
+ consume_skb(skb);
module_put(ss->owner);
}
@@ -520,8 +629,8 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg))
return;
- err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy,
- NULL);
+ err = nla_parse_deprecated(cda, NFNL_BATCH_MAX, attr, attrlen,
+ nfnl_batch_policy, NULL);
if (err < 0) {
netlink_ack(skb, nlh, err, NULL);
return;
@@ -532,7 +641,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh)
nfgenmsg = nlmsg_data(nlh);
skb_pull(skb, msglen);
/* Work around old nft using host byte order */
- if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+ if (nfgenmsg->res_id == (__force __be16)NFNL_SUBSYS_NFTABLES)
res_id = NFNL_SUBSYS_NFTABLES;
else
res_id = ntohs(nfgenmsg->res_id);
@@ -560,7 +669,44 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
-#ifdef CONFIG_MODULES
+static void nfnetlink_bind_event(struct net *net, unsigned int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+ u8 v;
+
+ /* All NFNLGRP_CONNTRACK_* group bits fit into u8.
+ * The other groups are not relevant and can be ignored.
+ */
+ if (group >= 8)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ v = READ_ONCE(nf_ctnetlink_has_listener);
+ if ((v & group_bit) == 0) {
+ v |= group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
+ }
+
+ spin_unlock(&nfnl_grp_active_lock);
+#endif
+}
+
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
@@ -575,44 +721,83 @@ static int nfnetlink_bind(struct net *net, int group)
ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
- request_module("nfnetlink-subsys-%d", type);
+ request_module_nowait("nfnetlink-subsys-%d", type);
+
+ nfnetlink_bind_event(net, group);
return 0;
}
+
+static void nfnetlink_unbind(struct net *net, int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type, group_bit;
+
+ if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+ return;
+
+ type = nfnl_group2type[group];
+
+ switch (type) {
+ case NFNL_SUBSYS_CTNETLINK:
+ break;
+ case NFNL_SUBSYS_CTNETLINK_EXP:
+ break;
+ default:
+ return;
+ }
+
+ /* ctnetlink_has_listener is u8 */
+ if (group >= 8)
+ return;
+
+ group_bit = (1 << group);
+
+ spin_lock(&nfnl_grp_active_lock);
+ if (!nfnetlink_has_listeners(net, group)) {
+ u8 v = READ_ONCE(nf_ctnetlink_has_listener);
+
+ v &= ~group_bit;
+
+ /* read concurrently without nfnl_grp_active_lock held. */
+ WRITE_ONCE(nf_ctnetlink_has_listener, v);
+ }
+ spin_unlock(&nfnl_grp_active_lock);
#endif
+}
static int __net_init nfnetlink_net_init(struct net *net)
{
- struct sock *nfnl;
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
-#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
-#endif
+ .unbind = nfnetlink_unbind,
};
- nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
- if (!nfnl)
+ nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
+ if (!nfnlnet->nfnl)
return -ENOMEM;
- net->nfnl_stash = nfnl;
- rcu_assign_pointer(net->nfnl, nfnl);
return 0;
}
static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
{
+ struct nfnl_net *nfnlnet;
struct net *net;
- list_for_each_entry(net, net_exit_list, exit_list)
- RCU_INIT_POINTER(net->nfnl, NULL);
- synchronize_net();
- list_for_each_entry(net, net_exit_list, exit_list)
- netlink_kernel_release(net->nfnl_stash);
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nfnlnet = nfnl_pernet(net);
+
+ netlink_kernel_release(nfnlnet->nfnl);
+ }
}
static struct pernet_operations nfnetlink_net_ops = {
.init = nfnetlink_net_init,
.exit_batch = nfnetlink_net_exit_batch,
+ .id = &nfnetlink_pernet_id,
+ .size = sizeof(struct nfnl_net),
};
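
With .id and .size filled in, the pernet core allocates one struct nfnl_net per namespace and net_generic() retrieves it, which is what lets the nfnl/nfnl_stash pointers move out of struct net. The pattern as a minimal kernel-style sketch (names invented, not a real module):

    #include <net/net_namespace.h>
    #include <net/netns/generic.h>

    static unsigned int demo_net_id __read_mostly;

    struct demo_net {
        unsigned long counter;
    };

    static int __net_init demo_net_init(struct net *net)
    {
        struct demo_net *dn = net_generic(net, demo_net_id);

        dn->counter = 0; /* storage was preallocated by the pernet core */
        return 0;
    }

    static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .id   = &demo_net_id,
        .size = sizeof(struct demo_net),
    };

    /* register_pernet_subsys(&demo_net_ops) from the module's init path */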
static int __init nfnetlink_init(void)
@@ -623,7 +808,7 @@ static int __init nfnetlink_init(void)
BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
	for (i = 0; i < NFNL_SUBSYS_COUNT; i++)
- mutex_init(&table[i].mutex);
+ __mutex_init(&table[i].mutex, nfnl_lockdep_names[i], &nfnl_lockdep_keys[i]);
return register_pernet_subsys(&nfnetlink_net_ops);
}
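
The per-slot lock_class_key array matters because mutexes initialized from a single call site normally share one lockdep class; legitimate nesting of two different subsystem mutexes would then be misreported as recursive locking. Giving each slot its own key and name keeps lockdep's view per subsystem. The same idea in isolation (sketch only, demo names invented):

    #include <linux/init.h>
    #include <linux/mutex.h>

    #define DEMO_NSLOTS 2

    static struct mutex demo_locks[DEMO_NSLOTS];
    static struct lock_class_key demo_keys[DEMO_NSLOTS];
    static const char *const demo_names[DEMO_NSLOTS] = { "demo_a", "demo_b" };

    static void __init demo_locks_init(void)
    {
        int i;

        /* one lockdep class per slot instead of one for the whole array */
        for (i = 0; i < DEMO_NSLOTS; i++)
            __mutex_init(&demo_locks[i], demo_names[i], &demo_keys[i]);
    }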
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 8fa8bf7c48e6..505f46a32173 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation (or any later at your option).
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#include <linux/init.h>
#include <linux/module.h>
@@ -19,6 +16,7 @@
#include <linux/errno.h>
#include <net/netlink.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
@@ -36,7 +34,7 @@ struct nf_acct {
refcount_t refcnt;
char name[NFACCT_NAME_MAX];
struct rcu_head rcu_head;
- char data[0];
+ char data[];
};
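
The data[0] to data[] change swaps a GNU zero-length array for a C99 flexible array member, which lets the compiler and fortified string/memory helpers reason about the trailing storage. Allocation then conventionally pairs with struct_size() so the size arithmetic cannot overflow; roughly (kernel-style sketch, names invented):

    #include <linux/overflow.h>
    #include <linux/slab.h>

    struct blob {
        unsigned int len;
        char data[]; /* flexible array member: must be last */
    };

    static struct blob *blob_alloc(unsigned int len, gfp_t gfp)
    {
        /* struct_size() == sizeof(*b) + len, with overflow checking */
        struct blob *b = kzalloc(struct_size(b, data, len), gfp);

        if (b)
            b->len = len;
        return b;
    }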
struct nfacct_filter {
@@ -44,17 +42,27 @@ struct nfacct_filter {
u32 mask;
};
+struct nfnl_acct_net {
+ struct list_head nfnl_acct_list;
+};
+
+static unsigned int nfnl_acct_net_id __read_mostly;
+
+static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_acct_net_id);
+}
+
#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
-static int nfnl_acct_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *nfacct, *matching = NULL;
- char *acct_name;
unsigned int size = 0;
+ char *acct_name;
u32 flags = 0;
if (!tb[NFACCT_NAME])
@@ -64,11 +72,11 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
if (strlen(acct_name) == 0)
return -EINVAL;
- list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
+ list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = nfacct;
@@ -76,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* reset counters if you request a replacement. */
atomic64_set(&matching->pkts, 0);
atomic64_set(&matching->bytes, 0);
@@ -115,7 +123,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
nfacct->flags = flags;
}
- nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
+ nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);
if (tb[NFACCT_BYTES]) {
atomic64_set(&nfacct->bytes,
@@ -126,7 +134,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl,
be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
}
refcount_set(&nfacct->refcnt, 1);
- list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
+ list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list);
return 0;
}
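
The nla_strlcpy() to nla_strscpy() conversion mirrors the kernel-wide strlcpy() retirement: nla_strscpy() returns the number of bytes copied, or -E2BIG when the attribute payload had to be truncated, instead of the would-be source length. Callers that must reject over-long names can test the result directly; a sketch:

	ssize_t len = nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX);

	if (len == -E2BIG)	/* hypothetical: refuse truncated names */
		return -EINVAL;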
@@ -135,21 +143,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_acct *acct)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
u64 pkts, bytes;
u32 old_flags;
event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFACCT_NAME, acct->name))
goto nla_put_failure;
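
Every fill function in this series sheds the same four lines of nfgenmsg initialisation in favour of the new nfnl_msg_put() helper, which is roughly:

	static inline struct nlmsghdr *
	nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type,
		     int flags, u8 family, u8 version, __be16 res_id)
	{
		struct nlmsghdr *nlh;
		struct nfgenmsg *nfmsg;

		nlh = nlmsg_put(skb, portid, seq, type, sizeof(*nfmsg), flags);
		if (!nlh)
			return NULL;

		nfmsg = nlmsg_data(nlh);
		nfmsg->nfgen_family = family;
		nfmsg->version = version;
		nfmsg->res_id = res_id;
		return nlh;
	}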
@@ -191,6 +194,7 @@ static int
nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *last;
const struct nfacct_filter *filter = cb->data;
@@ -202,7 +206,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (last) {
if (cur != last)
continue;
@@ -248,8 +252,8 @@ static int nfnl_acct_start(struct netlink_callback *cb)
if (!attr)
return 0;
- err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy,
- NULL);
+ err = nla_parse_nested_deprecated(tb, NFACCT_FILTER_MAX, attr,
+ filter_policy, NULL);
if (err < 0)
return err;
@@ -267,16 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb)
return 0;
}
-static int nfnl_acct_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_acct_dump,
.start = nfnl_acct_start,
@@ -284,14 +287,14 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
.data = (void *)tb[NFACCT_FILTER],
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!tb[NFACCT_NAME])
return -EINVAL;
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
@@ -304,21 +307,18 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
- NFNL_MSG_ACCT_NEW, cur);
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
+ NFNL_MSG_ACCT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
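
The removed netlink_unicast() boilerplate, including the EAGAIN-to-ENOBUFS translation behind the "this avoids a loop in nfnetlink" comment, now lives once in nfnetlink_unicast(). Roughly:

	int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid)
	{
		int err;

		/* net->nfnl stands in for "this netns's nfnetlink socket";
		 * the exact accessor depends on the pernet conversion above */
		err = nlmsg_unicast(net->nfnl, skb, portid);
		if (err == -EAGAIN)
			err = -ENOBUFS;	/* don't make nfnetlink retry-loop */
		return err;
	}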
@@ -340,24 +340,23 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
return ret;
}
-static int nfnl_acct_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net);
struct nf_acct *cur, *tmp;
int ret = -ENOENT;
char *acct_name;
if (!tb[NFACCT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head)
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head)
nfnl_acct_try_del(cur);
return 0;
}
acct_name = nla_data(tb[NFACCT_NAME]);
- list_for_each_entry(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -380,18 +379,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
};
static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
- [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
- [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del,
- .attr_count = NFACCT_MAX,
- .policy = nfnl_acct_policy },
+ [NFNL_MSG_ACCT_NEW] = {
+ .call = nfnl_acct_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_GET_CTRZERO] = {
+ .call = nfnl_acct_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
+ [NFNL_MSG_ACCT_DEL] = {
+ .call = nfnl_acct_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFACCT_MAX,
+ .policy = nfnl_acct_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_acct_subsys = {
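
The new .type field tells the nfnetlink core how to dispatch each callback: NFNL_CB_MUTEX keeps the historic behaviour of running under the per-subsystem nfnl mutex, while NFNL_CB_RCU (used by the nfnetlink_hook subsystem added later in this series) runs under rcu_read_lock() only. The enum is approximately:

	enum nfnl_callback_type {
		NFNL_CB_UNSPEC = 0,
		NFNL_CB_MUTEX,	/* called with nfnl_lock(subsys_id) held */
		NFNL_CB_RCU,	/* called under rcu_read_lock() */
		NFNL_CB_BATCH,	/* nf_tables transaction batches */
	};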
@@ -405,10 +416,11 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *acct = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
+ list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) {
if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
continue;
@@ -460,8 +472,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
kfree_skb(skb);
return;
}
- netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA,
- GFP_ATOMIC);
+ nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC);
}
int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct)
@@ -491,16 +502,17 @@ EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
static int __net_init nfnl_acct_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfnl_acct_list);
+ INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list);
return 0;
}
static void __net_exit nfnl_acct_net_exit(struct net *net)
{
+ struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net);
struct nf_acct *cur, *tmp;
- list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+ list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) {
list_del_rcu(&cur->head);
if (refcount_dec_and_test(&cur->refcnt))
@@ -511,6 +523,8 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
static struct pernet_operations nfnl_acct_ops = {
.init = nfnl_acct_net_init,
.exit = nfnl_acct_net_exit,
+ .id = &nfnl_acct_net_id,
+ .size = sizeof(struct nfnl_acct_net),
};
static int __init nfnl_acct_init(void)
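
Setting .id and .size is what lets the list head move out of struct net: during register_pernet_subsys() the netns core assigns a slot index through .id and, for every existing and future namespace, allocates a zeroed blob of .size bytes reachable via net_generic(). A sketch of the resulting lifecycle:

	err = register_pernet_subsys(&nfnl_acct_ops);
	if (err)
		return err;
	/* from here on nfnl_acct_net_id is valid and every struct net
	 * already carries a zeroed struct nfnl_acct_net; the core frees
	 * it after ->exit() when the namespace dies */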
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index e5d27b2e4eba..97248963a7d3 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation (or any later at your option).
- *
* This software has been sponsored by Vyatta Inc. <http://www.vyatta.com>
*/
#include <linux/init.h>
@@ -78,8 +75,8 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple,
int err;
struct nlattr *tb[NFCTH_TUPLE_MAX+1];
- err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr,
- nfnl_cthelper_tuple_pol, NULL);
+ err = nla_parse_nested_deprecated(tb, NFCTH_TUPLE_MAX, attr,
+ nfnl_cthelper_tuple_pol, NULL);
if (err < 0)
return err;
@@ -99,14 +96,16 @@ static int
nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
{
struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
if (attr == NULL)
return -EINVAL;
- if (help->helper->data_len == 0)
+ helper = rcu_dereference(help->helper);
+ if (!helper || helper->data_len == 0)
return -EINVAL;
- nla_memcpy(help->data, nla_data(attr), sizeof(help->data));
+ nla_memcpy(help->data, attr, sizeof(help->data));
return 0;
}
@@ -114,9 +113,11 @@ static int
nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
{
const struct nf_conn_help *help = nfct_help(ct);
+ const struct nf_conntrack_helper *helper;
- if (help->helper->data_len &&
- nla_put(skb, CTA_HELP_INFO, help->helper->data_len, &help->data))
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->data_len &&
+ nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
goto nla_put_failure;
return 0;
@@ -139,8 +140,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
int err;
struct nlattr *tb[NFCTH_POLICY_MAX+1];
- err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
- nfnl_cthelper_expect_pol, NULL);
+ err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr,
+ nfnl_cthelper_expect_pol, NULL);
if (err < 0)
return err;
@@ -149,7 +150,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
!tb[NFCTH_POLICY_EXPECT_TIMEOUT])
return -EINVAL;
- nla_strlcpy(expect_policy->name,
+ nla_strscpy(expect_policy->name,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
@@ -176,8 +177,9 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper,
struct nlattr *tb[NFCTH_POLICY_SET_MAX+1];
unsigned int class_max;
- ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
- nfnl_cthelper_expect_policy_set, NULL);
+ ret = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr,
+ nfnl_cthelper_expect_policy_set,
+ NULL);
if (ret < 0)
return ret;
@@ -235,13 +237,14 @@ nfnl_cthelper_create(const struct nlattr * const tb[],
if (ret < 0)
goto err1;
- nla_strlcpy(helper->name,
+ nla_strscpy(helper->name,
tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN);
size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
- if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
+ if (size > sizeof_field(struct nf_conn_help, data)) {
ret = -ENOMEM;
goto err2;
}
+ helper->data_len = size;
helper->flags |= NF_CT_HELPER_F_USERSPACE;
memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple));
@@ -289,8 +292,8 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
struct nlattr *tb[NFCTH_POLICY_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr,
- nfnl_cthelper_expect_pol, NULL);
+ err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr,
+ nfnl_cthelper_expect_pol, NULL);
if (err < 0)
return err;
@@ -361,8 +364,9 @@ static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
unsigned int class_max;
int err;
- err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr,
- nfnl_cthelper_expect_policy_set, NULL);
+ err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr,
+ nfnl_cthelper_expect_policy_set,
+ NULL);
if (err < 0)
return err;
@@ -380,10 +384,14 @@ static int
nfnl_cthelper_update(const struct nlattr * const tb[],
struct nf_conntrack_helper *helper)
{
+ u32 size;
int ret;
- if (tb[NFCTH_PRIV_DATA_LEN])
- return -EBUSY;
+ if (tb[NFCTH_PRIV_DATA_LEN]) {
+ size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
+ if (size != helper->data_len)
+ return -EBUSY;
+ }
if (tb[NFCTH_POLICY]) {
ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
@@ -408,10 +416,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return 0;
}
-static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
const char *helper_name;
struct nf_conntrack_helper *cur, *helper = NULL;
@@ -441,7 +447,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
tuple.dst.protonum != cur->tuple.dst.protonum))
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
helper = cur;
@@ -462,7 +468,7 @@ nfnl_cthelper_dump_tuple(struct sk_buff *skb,
{
struct nlattr *nest_parms;
- nest_parms = nla_nest_start(skb, NFCTH_TUPLE | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, NFCTH_TUPLE);
if (nest_parms == NULL)
goto nla_put_failure;
@@ -487,7 +493,7 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
int i;
struct nlattr *nest_parms1, *nest_parms2;
- nest_parms1 = nla_nest_start(skb, NFCTH_POLICY | NLA_F_NESTED);
+ nest_parms1 = nla_nest_start(skb, NFCTH_POLICY);
if (nest_parms1 == NULL)
goto nla_put_failure;
@@ -496,8 +502,7 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb,
goto nla_put_failure;
for (i = 0; i < helper->expect_class_max + 1; i++) {
- nest_parms2 = nla_nest_start(skb,
- (NFCTH_POLICY_SET+i) | NLA_F_NESTED);
+ nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET + i));
if (nest_parms2 == NULL)
goto nla_put_failure;
@@ -527,20 +532,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct nf_conntrack_helper *helper)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
int status;
event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFCTH_NAME, helper->name))
goto nla_put_failure;
@@ -613,10 +613,8 @@ out:
return skb->len;
}
-static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
int ret = -ENOENT;
struct nf_conntrack_helper *cur;
@@ -629,11 +627,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nfnl_cthelper_dump_table,
};
- return netlink_dump_start(nfnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (tb[NFCTH_NAME])
@@ -665,29 +663,23 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
}
ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_CTHELPER_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
-
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
-static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
char *helper_name = NULL;
struct nf_conntrack_helper *cur;
@@ -744,18 +736,29 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
[NFCTH_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN-1 },
[NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+ [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, },
+ [NFCTH_STATUS] = { .type = NLA_U32, },
};
static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
- [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
- [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del,
- .attr_count = NFCTH_MAX,
- .policy = nfnl_cthelper_policy },
+ [NFNL_MSG_CTHELPER_NEW] = {
+ .call = nfnl_cthelper_new,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_GET] = {
+ .call = nfnl_cthelper_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
+ [NFNL_MSG_CTHELPER_DEL] = {
+ .call = nfnl_cthelper_del,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFCTH_MAX,
+ .policy = nfnl_cthelper_policy
+ },
};
static const struct nfnetlink_subsystem nfnl_cthelper_subsys = {
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 109b0d27345a..38d75484e531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
* (C) 2012 by Vyatta Inc. <http://www.vyatta.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation (or any later at your option).
*/
#include <linux/init.h>
#include <linux/module.h>
@@ -23,6 +20,7 @@
#include <linux/netfilter.h>
#include <net/netlink.h>
+#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -33,6 +31,24 @@
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>
+static unsigned int nfct_timeout_id __read_mostly;
+
+struct ctnl_timeout {
+ struct list_head head;
+ struct list_head free_head;
+ struct rcu_head rcu_head;
+ refcount_t refcnt;
+ char name[CTNL_TIMEOUT_NAME_MAX];
+
+ /* must be at the end */
+ struct nf_ct_timeout timeout;
+};
+
+struct nfct_timeout_pernet {
+ struct list_head nfct_timeout_list;
+ struct list_head nfct_timeout_freelist;
+};
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
@@ -45,6 +61,11 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
[CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
};
+static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net)
+{
+ return net_generic(net, nfct_timeout_id);
+}
+
static int
ctnl_timeout_parse_policy(void *timeout,
const struct nf_conntrack_l4proto *l4proto,
@@ -59,8 +80,11 @@ ctnl_timeout_parse_policy(void *timeout,
if (!tb)
return -ENOMEM;
- ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, attr,
- l4proto->ctnl_timeout.nla_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb,
+ l4proto->ctnl_timeout.nlattr_max,
+ attr,
+ l4proto->ctnl_timeout.nla_policy,
+ NULL);
if (ret < 0)
goto err;
@@ -71,12 +95,11 @@ err:
return ret;
}
-static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_new_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
__u16 l3num;
__u8 l4num;
const struct nf_conntrack_l4proto *l4proto;
@@ -94,11 +117,11 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- list_for_each_entry(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
return -EEXIST;
matching = timeout;
@@ -106,7 +129,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
}
if (matching) {
- if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
/* You cannot replace one timeout policy by another of
* different kind, sorry.
*/
@@ -116,13 +139,14 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
return ctnl_timeout_parse_policy(&matching->timeout.data,
matching->timeout.l4proto,
- net, cda[CTA_TIMEOUT_DATA]);
+ info->net,
+ cda[CTA_TIMEOUT_DATA]);
}
return -EBUSY;
}
- l4proto = nf_ct_l4proto_find_get(l4num);
+ l4proto = nf_ct_l4proto_find(l4num);
/* This protocol is not supported, skip. */
if (l4proto->l4proto != l4num) {
@@ -137,8 +161,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
goto err_proto_put;
}
- ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net,
- cda[CTA_TIMEOUT_DATA]);
+ ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto,
+ info->net, cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -146,13 +170,13 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
timeout->timeout.l3num = l3num;
timeout->timeout.l4proto = l4proto;
refcount_set(&timeout->refcnt, 1);
- list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
+ __module_get(THIS_MODULE);
+ list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list);
return 0;
err:
kfree(timeout);
err_proto_put:
- nf_ct_l4proto_put(l4proto);
return ret;
}
@@ -161,22 +185,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int event, struct ctnl_timeout *timeout)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) ||
nla_put_be16(skb, CTA_TIMEOUT_L3PROTO,
htons(timeout->timeout.l3num)) ||
@@ -185,7 +204,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
htonl(refcount_read(&timeout->refcnt))))
goto nla_put_failure;
- nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA);
if (!nest_parms)
goto nla_put_failure;
@@ -207,6 +226,7 @@ nla_put_failure:
static int
ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nfct_timeout_pernet *pernet;
struct net *net = sock_net(skb->sk);
struct ctnl_timeout *cur, *last;
@@ -218,7 +238,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) {
+ pernet = nfct_timeout_pernet(net);
+ list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) {
if (last) {
if (cur != last)
continue;
@@ -239,28 +260,27 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_get_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
int ret = -ENOENT;
char *name;
struct ctnl_timeout *cur;
- if (nlh->nlmsg_flags & NLM_F_DUMP) {
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnl_timeout_dump,
};
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
if (!cda[CTA_TIMEOUT_NAME])
return -EINVAL;
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
@@ -273,21 +293,18 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
}
ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_NEW, cur);
if (ret <= 0) {
kfree_skb(skb2);
break;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
+ break;
}
+
return ret;
}
@@ -302,7 +319,6 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
if (refcount_dec_if_one(&timeout->refcnt)) {
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
- nf_ct_l4proto_put(timeout->timeout.l4proto);
nf_ct_untimeout(net, &timeout->timeout);
kfree_rcu(timeout, rcu_head);
} else {
@@ -311,30 +327,29 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
return ret;
}
-static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_del_timeout(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net);
struct ctnl_timeout *cur, *tmp;
int ret = -ENOENT;
char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list,
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list,
head)
- ctnl_timeout_try_del(net, cur);
+ ctnl_timeout_try_del(info->net, cur);
return 0;
}
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &net->nfct_timeout_list, head) {
+ list_for_each_entry(cur, &pernet->nfct_timeout_list, head) {
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- ret = ctnl_timeout_try_del(net, cur);
+ ret = ctnl_timeout_try_del(info->net, cur);
if (ret < 0)
return ret;
@@ -343,23 +358,20 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
return ret;
}
-static int cttimeout_default_set(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_set(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
__u8 l4num;
int ret;
- if (!cda[CTA_TIMEOUT_L3PROTO] ||
- !cda[CTA_TIMEOUT_L4PROTO] ||
+ if (!cda[CTA_TIMEOUT_L4PROTO] ||
!cda[CTA_TIMEOUT_DATA])
return -EINVAL;
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- l4proto = nf_ct_l4proto_find_get(l4num);
+ l4proto = nf_ct_l4proto_find(l4num);
/* This protocol is not supported, skip. */
if (l4proto->l4proto != l4num) {
@@ -367,15 +379,13 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
goto err;
}
- ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
+ ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
- nf_ct_l4proto_put(l4proto);
return 0;
err:
- nf_ct_l4proto_put(l4proto);
return ret;
}
@@ -386,26 +396,21 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
const unsigned int *timeouts)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
struct nlattr *nest_parms;
int ret;
event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = AF_UNSPEC;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) ||
nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto))
goto nla_put_failure;
- nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED);
+ nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA);
if (!nest_parms)
goto nla_put_failure;
@@ -424,66 +429,53 @@ nla_put_failure:
return -1;
}
-static int cttimeout_default_get(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[],
- struct netlink_ext_ack *extack)
+static int cttimeout_default_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
const struct nf_conntrack_l4proto *l4proto;
unsigned int *timeouts = NULL;
struct sk_buff *skb2;
- int ret, err;
__u16 l3num;
__u8 l4num;
+ int ret;
if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO])
return -EINVAL;
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- l4proto = nf_ct_l4proto_find_get(l4num);
+ l4proto = nf_ct_l4proto_find(l4num);
- err = -EOPNOTSUPP;
if (l4proto->l4proto != l4num)
- goto err;
+ return -EOPNOTSUPP;
switch (l4proto->l4proto) {
case IPPROTO_ICMP:
- timeouts = &nf_icmp_pernet(net)->timeout;
+ timeouts = &nf_icmp_pernet(info->net)->timeout;
break;
case IPPROTO_TCP:
- timeouts = nf_tcp_pernet(net)->timeouts;
+ timeouts = nf_tcp_pernet(info->net)->timeouts;
break;
- case IPPROTO_UDP: /* fallthrough */
+ case IPPROTO_UDP:
case IPPROTO_UDPLITE:
- timeouts = nf_udp_pernet(net)->timeouts;
- break;
- case IPPROTO_DCCP:
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- timeouts = nf_dccp_pernet(net)->dccp_timeout;
-#endif
+ timeouts = nf_udp_pernet(info->net)->timeouts;
break;
case IPPROTO_ICMPV6:
- timeouts = &nf_icmpv6_pernet(net)->timeout;
+ timeouts = &nf_icmpv6_pernet(info->net)->timeout;
break;
case IPPROTO_SCTP:
#ifdef CONFIG_NF_CT_PROTO_SCTP
- timeouts = nf_sctp_pernet(net)->timeouts;
+ timeouts = nf_sctp_pernet(info->net)->timeouts;
#endif
break;
case IPPROTO_GRE:
#ifdef CONFIG_NF_CT_PROTO_GRE
- if (l4proto->net_id) {
- struct netns_proto_gre *net_gre;
-
- net_gre = net_generic(net, *l4proto->net_id);
- timeouts = net_gre->gre_timeouts;
- }
+ timeouts = nf_gre_pernet(info->net)->timeouts;
#endif
break;
case 255:
- timeouts = &nf_generic_pernet(net)->timeout;
+ timeouts = &nf_generic_pernet(info->net)->timeout;
break;
default:
WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto);
@@ -491,51 +483,38 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl,
}
if (!timeouts)
- goto err;
+ return -EOPNOTSUPP;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb2 == NULL) {
- err = -ENOMEM;
- goto err;
- }
+ if (!skb2)
+ return -ENOMEM;
- ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ ret = cttimeout_default_fill_info(info->net, skb2,
+ NETLINK_CB(skb).portid,
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
IPCTNL_MSG_TIMEOUT_DEFAULT_SET,
l3num, l4proto, timeouts);
if (ret <= 0) {
kfree_skb(skb2);
- err = -ENOMEM;
- goto err;
+ return -ENOMEM;
}
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
- /* this avoids a loop in nfnetlink. */
- return ret == -EAGAIN ? -ENOBUFS : ret;
-err:
- nf_ct_l4proto_put(l4proto);
- return err;
+ return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
const char *name)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *timeout, *matching = NULL;
- list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) {
+ list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- if (!try_module_get(THIS_MODULE))
- goto err;
-
- if (!refcount_inc_not_zero(&timeout->refcnt)) {
- module_put(THIS_MODULE);
+ if (!refcount_inc_not_zero(&timeout->refcnt))
goto err;
- }
matching = timeout;
break;
}
@@ -548,28 +527,43 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t)
struct ctnl_timeout *timeout =
container_of(t, struct ctnl_timeout, timeout);
- if (refcount_dec_and_test(&timeout->refcnt))
+ if (refcount_dec_and_test(&timeout->refcnt)) {
kfree_rcu(timeout, rcu_head);
-
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+ }
}
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
- [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
- [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get,
- .attr_count = CTA_TIMEOUT_MAX,
- .policy = cttimeout_nla_policy },
+ [IPCTNL_MSG_TIMEOUT_NEW] = {
+ .call = cttimeout_new_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_GET] = {
+ .call = cttimeout_get_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DELETE] = {
+ .call = cttimeout_del_timeout,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = {
+ .call = cttimeout_default_set,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
+ [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = {
+ .call = cttimeout_default_get,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = CTA_TIMEOUT_MAX,
+ .policy = cttimeout_nla_policy
+ },
};
static const struct nfnetlink_subsystem cttimeout_subsys = {
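
Module pinning is inverted here: instead of try_module_get() on every lookup, each timeout object created by cttimeout_new_timeout() owns one module reference (the __module_get() above), and the final ctnl_timeout_put() releases it together with the object. Schematically:

	/* creation: the object itself pins the module */
	refcount_set(&timeout->refcnt, 1);
	__module_get(THIS_MODULE);

	/* last put: free the object, then drop its module reference */
	if (refcount_dec_and_test(&timeout->refcnt)) {
		kfree_rcu(timeout, rcu_head);
		module_put(THIS_MODULE);
	}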
@@ -583,21 +577,39 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
static int __net_init cttimeout_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->nfct_timeout_list);
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+
+ INIT_LIST_HEAD(&pernet->nfct_timeout_list);
+ INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
+static void __net_exit cttimeout_net_pre_exit(struct net *net)
+{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+ struct ctnl_timeout *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
+ }
+
+ /* core calls synchronize_rcu() after this */
+}
+
static void __net_exit cttimeout_net_exit(struct net *net)
{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
- nf_ct_unconfirmed_destroy(net);
+ if (list_empty(&pernet->nfct_timeout_freelist))
+ return;
+
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
- list_del_rcu(&cur->head);
- nf_ct_l4proto_put(cur->timeout.l4proto);
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) {
+ list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -606,7 +618,15 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
+ .pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
+ .id = &nfct_timeout_id,
+ .size = sizeof(struct nfct_timeout_pernet),
+};
+
+static const struct nf_ct_timeout_hooks hooks = {
+ .timeout_find_get = ctnl_timeout_find_get,
+ .timeout_put = ctnl_timeout_put,
};
static int __init cttimeout_init(void)
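
The pre_exit/exit split leans on an ordering guarantee of the pernet core: for a dying namespace, all ->pre_exit() handlers run first, then the core issues one synchronize_rcu(), and only then do the ->exit() handlers run; that is what the "core calls synchronize_rcu() after this" comment refers to. In outline:

	/* teardown order enforced by the pernet core (sketch):
	 *
	 *   ops->pre_exit(net);	// list_del_rcu() + move to freelist
	 *   synchronize_rcu();		// single grace period for all entries
	 *   ops->exit(net);		// walk freelist, drop the references
	 */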
@@ -623,8 +643,7 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
+ RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks);
return 0;
err_out:
@@ -632,14 +651,24 @@ err_out:
return ret;
}
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ return 0;
+}
+
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
- RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
- RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
- synchronize_rcu();
+ RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
+
+ nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
new file mode 100644
index 000000000000..92d869317cba
--- /dev/null
+++ b/net/netfilter/nfnetlink_hook.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021 Red Hat GmbH
+ *
+ * Author: Florian Westphal <fw@strlen.de>
+ */
+
+#include <linux/bpf.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_hook.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <net/sock.h>
+
+static const struct nla_policy nfnl_hook_nla_policy[NFNLA_HOOK_MAX + 1] = {
+ [NFNLA_HOOK_HOOKNUM] = { .type = NLA_U32 },
+ [NFNLA_HOOK_PRIORITY] = { .type = NLA_U32 },
+ [NFNLA_HOOK_DEV] = { .type = NLA_STRING,
+ .len = IFNAMSIZ - 1 },
+ [NFNLA_HOOK_FUNCTION_NAME] = { .type = NLA_NUL_STRING,
+ .len = KSYM_NAME_LEN, },
+ [NFNLA_HOOK_MODULE_NAME] = { .type = NLA_NUL_STRING,
+ .len = MODULE_NAME_LEN, },
+ [NFNLA_HOOK_CHAIN_INFO] = { .type = NLA_NESTED, },
+};
+
+static int nf_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *c)
+{
+ int err;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ rcu_read_unlock();
+ err = netlink_dump_start(nlsk, skb, nlh, c);
+ rcu_read_lock();
+ module_put(THIS_MODULE);
+
+ return err;
+}
+
+struct nfnl_dump_hook_data {
+ char devname[IFNAMSIZ];
+ unsigned long headv;
+ u8 hook;
+};
+
+static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t)
+{
+ struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+ int ret;
+
+ if (!nest)
+ return NULL;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t));
+ if (ret == 0)
+ return nest;
+
+ nla_nest_cancel(nlskb, nest);
+ return NULL;
+}
+
+static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ const struct bpf_prog *prog)
+{
+ struct nlattr *nest, *nest2;
+ int ret;
+
+ if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK))
+ return 0;
+
+ if (WARN_ON_ONCE(!prog))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF);
+ if (!nest)
+ return -EMSGSIZE;
+
+ nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest2)
+ goto cancel_nest;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id));
+ if (ret)
+ goto cancel_nest;
+
+ nla_nest_end(nlskb, nest2);
+ nla_nest_end(nlskb, nest);
+ return 0;
+
+cancel_nest:
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+}
+
+static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname,
+ const char *name, u8 family)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+ if (!nest ||
+ nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) ||
+ nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) ||
+ nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) {
+ nla_nest_cancel(nlskb, nest);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nft_chain *chain)
+{
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!chain))
+ return 0;
+
+ if (!nft_is_active(net, chain))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name,
+ chain->name, chain->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ unsigned int seq,
+ struct nf_flowtable *nf_ft)
+{
+ struct nft_flowtable *ft =
+ container_of(nf_ft, struct nft_flowtable, data);
+ struct net *net = sock_net(nlskb->sk);
+ struct nlattr *nest;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!nf_ft))
+ return 0;
+
+ if (!nft_is_active(net, ft))
+ return 0;
+
+ nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name,
+ ft->name, ft->table->family);
+ if (ret) {
+ nla_nest_cancel(nlskb, nest);
+ return ret;
+ }
+
+ nla_nest_end(nlskb, nest);
+ return 0;
+}
+
+static int nfnl_hook_dump_one(struct sk_buff *nlskb,
+ const struct nfnl_dump_hook_data *ctx,
+ const struct nf_hook_ops *ops,
+ int family, unsigned int seq)
+{
+ u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET);
+ unsigned int portid = NETLINK_CB(nlskb).portid;
+ struct nlmsghdr *nlh;
+ int ret = -EMSGSIZE;
+ u32 hooknum;
+#ifdef CONFIG_KALLSYMS
+ char sym[KSYM_SYMBOL_LEN];
+ char *module_name;
+#endif
+ nlh = nfnl_msg_put(nlskb, portid, seq, event,
+ NLM_F_MULTI, family, NFNETLINK_V0, 0);
+ if (!nlh)
+ goto nla_put_failure;
+
+#ifdef CONFIG_KALLSYMS
+ ret = snprintf(sym, sizeof(sym), "%ps", ops->hook);
+ if (ret >= sizeof(sym)) {
+ ret = -EINVAL;
+ goto nla_put_failure;
+ }
+
+ module_name = strstr(sym, " [");
+ if (module_name) {
+ char *end;
+
+ *module_name = '\0';
+ module_name += 2;
+ end = strchr(module_name, ']');
+ if (end) {
+ *end = 0;
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_MODULE_NAME, module_name);
+ if (ret)
+ goto nla_put_failure;
+ }
+ }
+
+ ret = nla_put_string(nlskb, NFNLA_HOOK_FUNCTION_NAME, sym);
+ if (ret)
+ goto nla_put_failure;
+#endif
+
+ if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS)
+ hooknum = NF_NETDEV_INGRESS;
+ else
+ hooknum = ops->hooknum;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum));
+ if (ret)
+ goto nla_put_failure;
+
+ ret = nla_put_be32(nlskb, NFNLA_HOOK_PRIORITY, htonl(ops->priority));
+ if (ret)
+ goto nla_put_failure;
+
+ switch (ops->hook_ops_type) {
+ case NF_HOOK_OP_NF_TABLES:
+ ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_BPF:
+ ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_NFT_FT:
+ ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv);
+ break;
+ case NF_HOOK_OP_UNDEFINED:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (ret)
+ goto nla_put_failure;
+
+ nlmsg_end(nlskb, nlh);
+ return 0;
+nla_put_failure:
+ nlmsg_trim(nlskb, nlh);
+ return ret;
+}
+
+static const struct nf_hook_entries *
+nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev)
+{
+ const struct nf_hook_entries *hook_head = NULL;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ struct net_device *netdev;
+#endif
+
+ switch (pf) {
+ case NFPROTO_IPV4:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv4))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
+ break;
+ case NFPROTO_IPV6:
+ if (hook >= ARRAY_SIZE(net->nf.hooks_ipv6))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
+ break;
+ case NFPROTO_ARP:
+#ifdef CONFIG_NETFILTER_FAMILY_ARP
+ if (hook >= ARRAY_SIZE(net->nf.hooks_arp))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
+#endif
+ break;
+ case NFPROTO_BRIDGE:
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ if (hook >= ARRAY_SIZE(net->nf.hooks_bridge))
+ return ERR_PTR(-EINVAL);
+ hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+#endif
+ break;
+#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS)
+ case NFPROTO_NETDEV:
+ if (hook >= NF_NETDEV_NUMHOOKS)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ netdev = dev_get_by_name_rcu(net, dev);
+ if (!netdev)
+ return ERR_PTR(-ENODEV);
+
+#ifdef CONFIG_NETFILTER_INGRESS
+ if (hook == NF_NETDEV_INGRESS)
+ return rcu_dereference(netdev->nf_hooks_ingress);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ if (hook == NF_NETDEV_EGRESS)
+ return rcu_dereference(netdev->nf_hooks_egress);
+#endif
+ fallthrough;
+#endif
+ default:
+ return ERR_PTR(-EPROTONOSUPPORT);
+ }
+
+ return hook_head;
+}
+
+static int nfnl_hook_dump(struct sk_buff *nlskb,
+ struct netlink_callback *cb)
+{
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nfnl_dump_hook_data *ctx = cb->data;
+ int err, family = nfmsg->nfgen_family;
+ struct net *net = sock_net(nlskb->sk);
+ struct nf_hook_ops * const *ops;
+ const struct nf_hook_entries *e;
+ unsigned int i = cb->args[0];
+
+ rcu_read_lock();
+
+ e = nfnl_hook_entries_head(family, ctx->hook, net, ctx->devname);
+ if (!e)
+ goto done;
+
+ if (IS_ERR(e)) {
+ cb->seq++;
+ goto done;
+ }
+
+ if ((unsigned long)e != ctx->headv || i >= e->num_hook_entries)
+ cb->seq++;
+
+ ops = nf_hook_entries_get_hook_ops(e);
+
+ for (; i < e->num_hook_entries; i++) {
+ err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family,
+ cb->nlh->nlmsg_seq);
+ if (err)
+ break;
+ }
+
+done:
+ nl_dump_check_consistent(cb, nlmsg_hdr(nlskb));
+ rcu_read_unlock();
+ cb->args[0] = i;
+ return nlskb->len;
+}
+
+static int nfnl_hook_dump_start(struct netlink_callback *cb)
+{
+ const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ const struct nlattr * const *nla = cb->data;
+ struct nfnl_dump_hook_data *ctx = NULL;
+ struct net *net = sock_net(cb->skb->sk);
+ u8 family = nfmsg->nfgen_family;
+ char name[IFNAMSIZ] = "";
+ const void *head;
+ u32 hooknum;
+
+ hooknum = ntohl(nla_get_be32(nla[NFNLA_HOOK_HOOKNUM]));
+ if (hooknum > 255)
+ return -EINVAL;
+
+ if (family == NFPROTO_NETDEV) {
+ if (!nla[NFNLA_HOOK_DEV])
+ return -EINVAL;
+
+ nla_strscpy(name, nla[NFNLA_HOOK_DEV], sizeof(name));
+ }
+
+ rcu_read_lock();
+ /* Not dereferenced; for consistency check only */
+ head = nfnl_hook_entries_head(family, hooknum, net, name);
+ rcu_read_unlock();
+
+ if (head && IS_ERR(head))
+ return PTR_ERR(head);
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ strscpy(ctx->devname, name, sizeof(ctx->devname));
+ ctx->headv = (unsigned long)head;
+ ctx->hook = hooknum;
+
+ cb->seq = 1;
+ cb->data = ctx;
+
+ return 0;
+}
+
+static int nfnl_hook_dump_stop(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
+static int nfnl_hook_get(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nla[])
+{
+ if (!nla[NFNLA_HOOK_HOOKNUM])
+ return -EINVAL;
+
+ if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+ struct netlink_dump_control c = {
+ .start = nfnl_hook_dump_start,
+ .done = nfnl_hook_dump_stop,
+ .dump = nfnl_hook_dump,
+ .module = THIS_MODULE,
+ .data = (void *)nla,
+ };
+
+ return nf_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const struct nfnl_callback nfnl_hook_cb[NFNL_MSG_HOOK_MAX] = {
+ [NFNL_MSG_HOOK_GET] = {
+ .call = nfnl_hook_get,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFNLA_HOOK_MAX,
+ .policy = nfnl_hook_nla_policy
+ },
+};
+
+static const struct nfnetlink_subsystem nfhook_subsys = {
+ .name = "nfhook",
+ .subsys_id = NFNL_SUBSYS_HOOK,
+ .cb_count = NFNL_MSG_HOOK_MAX,
+ .cb = nfnl_hook_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_HOOK);
+
+static int __init nfnetlink_hook_init(void)
+{
+ return nfnetlink_subsys_register(&nfhook_subsys);
+}
+
+static void __exit nfnetlink_hook_exit(void)
+{
+ nfnetlink_subsys_unregister(&nfhook_subsys);
+}
+
+module_init(nfnetlink_hook_init);
+module_exit(nfnetlink_hook_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("nfnetlink_hook: list registered netfilter hooks");
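
The subsystem is query-only: the single registered callback rejects everything except an NLM_F_DUMP GET. A request therefore always supplies a family, a hook number and, for NFPROTO_NETDEV, a device name. A hypothetical userspace sketch using libmnl (error handling elided, attribute choices assumed):

	struct mnl_socket *nl = mnl_socket_open(NETLINK_NETFILTER);
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nfgenmsg *nfg;

	nlh->nlmsg_type = (NFNL_SUBSYS_HOOK << 8) | NFNL_MSG_HOOK_GET;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = NFPROTO_IPV4;
	nfg->version = NFNETLINK_V0;
	mnl_attr_put_u32(nlh, NFNLA_HOOK_HOOKNUM, htonl(NF_INET_PREROUTING));

	mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
	/* then mnl_socket_recvfrom() + mnl_cb_run() to walk the dump */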
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b1f9c5303f02..bfcb9cd335bf 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for logging packets to userspace via
* nfnetlink.
@@ -7,10 +8,6 @@
*
* Based on the old ipv4-only ipt_ULOG.c:
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -46,6 +43,10 @@
#include "../bridge/br_private.h"
#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+
#define NFULNL_COPY_DISABLED 0xff
#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
@@ -65,6 +66,7 @@ struct nfulnl_instance {
struct sk_buff *skb; /* pre-allocated skb */
struct timer_list timer;
struct net *net;
+ netns_tracker ns_tracker;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
u32 peer_portid; /* PORTID of the peer process */
@@ -101,9 +103,9 @@ static inline u_int8_t instance_hashfn(u_int16_t group_num)
}
static struct nfulnl_instance *
-__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
+__instance_lookup(const struct nfnl_log_net *log, u16 group_num)
{
- struct hlist_head *head;
+ const struct hlist_head *head;
struct nfulnl_instance *inst;
head = &log->instance_table[instance_hashfn(group_num)];
@@ -121,15 +123,25 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
+instance_lookup_get_rcu(const struct nfnl_log_net *log, u16 group_num)
{
struct nfulnl_instance *inst;
- rcu_read_lock_bh();
inst = __instance_lookup(log, group_num);
if (inst && !refcount_inc_not_zero(&inst->use))
inst = NULL;
- rcu_read_unlock_bh();
+
+ return inst;
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(const struct nfnl_log_net *log, u16 group_num)
+{
+ struct nfulnl_instance *inst;
+
+ rcu_read_lock();
+ inst = instance_lookup_get_rcu(log, group_num);
+ rcu_read_unlock();
return inst;
}
@@ -139,7 +151,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head)
struct nfulnl_instance *inst =
container_of(head, struct nfulnl_instance, rcu);
- put_net(inst->net);
+ put_net_track(inst->net, &inst->ns_tracker);
kfree(inst);
module_put(THIS_MODULE);
}
@@ -186,7 +198,7 @@ instance_create(struct net *net, u_int16_t group_num,
timer_setup(&inst->timer, nfulnl_timer, 0);
- inst->net = get_net(net);
+ inst->net = get_net_track(net, &inst->ns_tracker, GFP_ATOMIC);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -359,8 +371,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
goto out;
}
}
- nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
- MSG_DONTWAIT);
+ nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid);
out:
inst->qlen = 0;
inst->skb = NULL;
@@ -370,7 +381,7 @@ static void
__nfulnl_flush(struct nfulnl_instance *inst)
{
/* timer holds a reference */
- if (del_timer(&inst->timer))
+ if (timer_delete(&inst->timer))
instance_put(inst);
if (inst->skb)
__nfulnl_send(inst);
@@ -379,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst)
static void
nfulnl_timer(struct timer_list *t)
{
- struct nfulnl_instance *inst = from_timer(inst, t, timer);
+ struct nfulnl_instance *inst = timer_container_of(inst, t, timer);
spin_lock_bh(&inst->lock);
if (inst->skb)
@@ -388,6 +399,57 @@ nfulnl_timer(struct timer_list *t)
instance_put(inst);
}
+static u32 nfulnl_get_bridge_size(const struct sk_buff *skb)
+{
+ u32 size = 0;
+
+ if (!skb_mac_header_was_set(skb))
+ return 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ size += nla_total_size(0); /* nested */
+ size += nla_total_size(sizeof(u16)); /* id */
+ size += nla_total_size(sizeof(u16)); /* tag */
+ }
+
+ if (skb->network_header > skb->mac_header)
+ size += nla_total_size(skb->network_header - skb->mac_header);
+
+ return size;
+}
+
+static int nfulnl_put_bridge(struct nfulnl_instance *inst, const struct sk_buff *skb)
+{
+ if (!skb_mac_header_was_set(skb))
+ return 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(inst->skb, NFULA_VLAN);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_be16(inst->skb, NFULA_VLAN_TCI, htons(skb->vlan_tci)) ||
+ nla_put_be16(inst->skb, NFULA_VLAN_PROTO, skb->vlan_proto))
+ goto nla_put_failure;
+
+ nla_nest_end(inst->skb, nest);
+ }
+
+ if (skb->mac_header < skb->network_header) {
+ int len = (int)(skb->network_header - skb->mac_header);
+
+ if (nla_put(inst->skb, NFULA_L2HDR, len, skb_mac_header(skb)))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
/* This is an inline function, we don't really care about a long
* list of arguments */
static inline int
@@ -405,20 +467,15 @@ __build_packet_message(struct nfnl_log_net *log,
{
struct nfulnl_msg_packet_hdr pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
sk_buff_data_t old_tail = inst->skb->tail;
struct sock *sk;
const unsigned char *hwhdrp;
- nlh = nlmsg_put(inst->skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(inst->skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET),
+ 0, pf, NFNETLINK_V0, htons(inst->group_num));
if (!nlh)
return -1;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(inst->group_num);
memset(&pmsg, 0, sizeof(pmsg));
pmsg.hw_protocol = skb->protocol;
@@ -451,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
} else {
- struct net_device *physindev;
+ int physinif;
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -459,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(indev->ifindex)))
goto nla_put_failure;
- physindev = nf_bridge_get_physindev(skb);
- if (physindev &&
+ physinif = nf_bridge_get_physinif(skb);
+ if (physinif &&
nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(physindev->ifindex)))
+ htonl(physinif)))
goto nla_put_failure;
}
#endif
@@ -510,7 +567,8 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
if (indev && skb->dev &&
- skb->mac_header != skb->network_header) {
+ skb_mac_header_was_set(skb) &&
+ skb_mac_header_len(skb) != 0) {
struct nfulnl_msg_packet_hw phw;
int len;
@@ -540,9 +598,9 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure;
}
- if (skb->tstamp) {
+ if (hooknum <= NF_INET_FORWARD) {
+ struct timespec64 kts = ktime_to_timespec64(skb_tstamp_cond(skb, true));
struct nfulnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -583,6 +641,10 @@ __build_packet_message(struct nfnl_log_net *log,
NFULA_CT, NFULA_CT_INFO) < 0)
goto nla_put_failure;
+ if ((pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE) &&
+ nfulnl_put_bridge(inst, skb) < 0)
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
int size = nla_attr_size(data_len);
@@ -636,15 +698,15 @@ nfulnl_log_packet(struct net *net,
unsigned int plen = 0;
struct nfnl_log_net *log = nfnl_log_pernet(net);
const struct nfnl_ct_hook *nfnl_ct = NULL;
+ enum ip_conntrack_info ctinfo = 0;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(log, li->u.ulog.group);
+ inst = instance_lookup_get_rcu(log, li->u.ulog.group);
if (!inst)
return;
@@ -654,7 +716,7 @@ nfulnl_log_packet(struct net *net,
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -671,7 +733,7 @@ nfulnl_log_packet(struct net *net,
+ nla_total_size(sizeof(struct nfgenmsg)); /* NLMSG_DONE */
if (in && skb_mac_header_was_set(skb)) {
- size += nla_total_size(skb->dev->hard_header_len)
+ size += nla_total_size(skb->dev->hard_header_len)
+ nla_total_size(sizeof(u_int16_t)) /* hwtype */
+ nla_total_size(sizeof(u_int16_t)); /* hwlen */
}
@@ -682,14 +744,18 @@ nfulnl_log_packet(struct net *net,
size += nla_total_size(sizeof(u_int32_t));
if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL)
size += nla_total_size(sizeof(u_int32_t));
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (inst->flags & NFULNL_CFG_F_CONNTRACK) {
nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(skb, &ctinfo);
+ ct = nf_ct_get(skb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
+ if (pf == NFPROTO_NETDEV || pf == NFPROTO_BRIDGE)
+ size += nfulnl_get_bridge_size(skb);
qthreshold = inst->qthreshold;
/* per-rule qthreshold overrides per-instance */
@@ -791,10 +857,8 @@ static struct notifier_block nfulnl_rtnl_notifier = {
.notifier_call = nfulnl_rcv_nl_event,
};
-static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
return -ENOTSUPP;
}
@@ -815,29 +879,26 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
[NFULA_CFG_FLAGS] = { .type = NLA_U16 },
};
-static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfula[],
- struct netlink_ext_ack *extack)
+static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfula[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t group_num = ntohs(nfmsg->res_id);
- struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(info->net);
+ u_int16_t group_num = ntohs(info->nfmsg->res_id);
struct nfulnl_msg_config_cmd *cmd = NULL;
- struct nfnl_log_net *log = nfnl_log_pernet(net);
- int ret = 0;
+ struct nfulnl_instance *inst;
u16 flags = 0;
+ int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
- u_int8_t pf = nfmsg->nfgen_family;
+ u_int8_t pf = info->nfmsg->nfgen_family;
cmd = nla_data(nfula[NFULA_CFG_CMD]);
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(net, pf, &nfulnl_logger);
+ return nf_log_bind_pf(info->net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(net, pf);
+ nf_log_unbind_pf(info->net, pf);
return 0;
}
}
@@ -878,7 +939,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
goto out_put;
}
- inst = instance_create(net, group_num,
+ inst = instance_create(info->net, group_num,
NETLINK_CB(skb).portid,
sk_user_ns(NETLINK_CB(skb).sk));
if (IS_ERR(inst)) {
@@ -939,11 +1000,17 @@ out:
}
static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = {
- [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp,
- .attr_count = NFULA_MAX, },
- [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config,
- .attr_count = NFULA_CFG_MAX,
- .policy = nfula_cfg_policy },
+ [NFULNL_MSG_PACKET] = {
+ .call = nfulnl_recv_unsupp,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_MAX,
+ },
+ [NFULNL_MSG_CONFIG] = {
+ .call = nfulnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFULA_CFG_MAX,
+ .policy = nfula_cfg_policy
+ },
};
static const struct nfnetlink_subsystem nfulnl_subsys = {
@@ -971,7 +1038,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
struct hlist_head *head = &log->instance_table[st->bucket];
if (!hlist_empty(head))
- return rcu_dereference_bh(hlist_first_rcu(head));
+ return rcu_dereference(hlist_first_rcu(head));
}
return NULL;
}
@@ -979,7 +1046,7 @@ static struct hlist_node *get_first(struct net *net, struct iter_state *st)
static struct hlist_node *get_next(struct net *net, struct iter_state *st,
struct hlist_node *h)
{
- h = rcu_dereference_bh(hlist_next_rcu(h));
+ h = rcu_dereference(hlist_next_rcu(h));
while (!h) {
struct nfnl_log_net *log;
struct hlist_head *head;
@@ -989,7 +1056,7 @@ static struct hlist_node *get_next(struct net *net, struct iter_state *st,
log = nfnl_log_pernet(net);
head = &log->instance_table[st->bucket];
- h = rcu_dereference_bh(hlist_first_rcu(head));
+ h = rcu_dereference(hlist_first_rcu(head));
}
return h;
}
@@ -1007,9 +1074,9 @@ static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
}
static void *seq_start(struct seq_file *s, loff_t *pos)
- __acquires(rcu_bh)
+ __acquires(rcu)
{
- rcu_read_lock_bh();
+ rcu_read_lock();
return get_idx(seq_file_net(s), s->private, *pos);
}
@@ -1020,9 +1087,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(rcu_bh)
+ __releases(rcu)
{
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
static int seq_show(struct seq_file *s, void *v)
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index 6f41dd74729d..c0fc431991e8 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/kernel.h>
@@ -32,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
{
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
const struct iphdr *ip = ip_hdr(skb);
+ const struct in_ifaddr *ifa;
int ret = 0;
if (ttl_check == NF_OSF_TTL_TRUE)
@@ -41,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
else if (ip->ttl <= f_ttl)
return 1;
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(ip->saddr, ifa)) {
ret = (ip->ttl == f_ttl);
break;
}
}
- endfor_ifa(in_dev);
-
return ret;
}
@@ -66,6 +66,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
int ttl_check,
struct nf_osf_hdr_ctx *ctx)
{
+ const __u8 *optpinit = ctx->optp;
unsigned int check_WSS = 0;
int fmatch = FMATCH_WRONG;
int foptsize, optnum;
@@ -155,18 +156,21 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
}
}
+ if (fmatch != FMATCH_OK)
+ ctx->optp = optpinit;
+
return fmatch == FMATCH_OK;
}
static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
const struct sk_buff *skb,
const struct iphdr *ip,
- unsigned char *opts)
+ unsigned char *opts,
+ struct tcphdr *_tcph)
{
const struct tcphdr *tcp;
- struct tcphdr _tcph;
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph);
if (!tcp)
return NULL;
@@ -182,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
sizeof(struct tcphdr), ctx->optsize, opts);
+ if (!ctx->optp)
+ return NULL;
}
return tcp;
@@ -201,10 +207,11 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
int fmatch = FMATCH_WRONG;
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
+ struct tcphdr _tcph;
memset(&ctx, 0, sizeof(ctx));
- tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);
if (!tcp)
return false;
@@ -251,9 +258,9 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
}
EXPORT_SYMBOL_GPL(nf_osf_match);
-const char *nf_osf_find(const struct sk_buff *skb,
- const struct list_head *nf_osf_fingers,
- const int ttl_check)
+bool nf_osf_find(const struct sk_buff *skb,
+ const struct list_head *nf_osf_fingers,
+ const int ttl_check, struct nf_osf_data *data)
{
const struct iphdr *ip = ip_hdr(skb);
const struct nf_osf_user_finger *f;
@@ -261,24 +268,27 @@ const char *nf_osf_find(const struct sk_buff *skb,
const struct nf_osf_finger *kf;
struct nf_osf_hdr_ctx ctx;
const struct tcphdr *tcp;
- const char *genre = NULL;
+ struct tcphdr _tcph;
+ bool found = false;
memset(&ctx, 0, sizeof(ctx));
- tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph);
if (!tcp)
- return NULL;
+ return false;
list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
f = &kf->finger;
if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
continue;
- genre = f->genre;
+ data->genre = f->genre;
+ data->version = f->version;
+ found = true;
break;
}
- return genre;
+ return found;
}
EXPORT_SYMBOL_GPL(nf_osf_find);
@@ -286,10 +296,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
[OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
};
-static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_add_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *kf = NULL, *sf;
@@ -301,11 +310,19 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
if (!osf_attrs[OSF_ATTR_FINGER])
return -EINVAL;
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
return -EINVAL;
f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+ if (f->opt_num > ARRAY_SIZE(f->opt))
+ return -EINVAL;
+
+ if (!memchr(f->genre, 0, MAXGENRELEN) ||
+ !memchr(f->subtype, 0, MAXGENRELEN) ||
+ !memchr(f->version, 0, MAXGENRELEN))
+ return -EINVAL;
+
kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
if (!kf)
return -ENOMEM;
@@ -319,7 +336,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
kfree(kf);
kf = NULL;
- if (nlh->nlmsg_flags & NLM_F_EXCL)
+ if (info->nlh->nlmsg_flags & NLM_F_EXCL)
err = -EEXIST;
break;
}
@@ -333,11 +350,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
return err;
}
-static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
+static int nfnl_osf_remove_callback(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const osf_attrs[])
{
struct nf_osf_user_finger *f;
struct nf_osf_finger *sf;
@@ -371,11 +386,13 @@ static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
[OSF_MSG_ADD] = {
.call = nfnl_osf_add_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
[OSF_MSG_REMOVE] = {
.call = nfnl_osf_remove_callback,
+ .type = NFNL_CB_MUTEX,
.attr_count = OSF_ATTR_MAX,
.policy = nfnl_osf_policy,
},
@@ -430,3 +447,5 @@ module_init(nfnl_osf_init);
module_exit(nfnl_osf_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Passive OS fingerprint matching");
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 0dcc3592d053..8b7b39d8a109 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for queueing packets and communicating with
* userspace via nfnetlink.
@@ -8,11 +9,6 @@
* Based on the old ipv4-only ip_queue.c:
* (C) 2000-2002 James Morris <jmorris@intercode.com.au>
* (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,6 +29,8 @@
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
+#include <linux/cgroup-defs.h>
+#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
@@ -171,7 +169,9 @@ instance_destroy_rcu(struct rcu_head *head)
struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance,
rcu);
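+	/* nfqnl_flush() reinjects the queued entries, and reinjection walks
+	 * the hook entries under RCU, so take the read lock here.
+	 */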
+ rcu_read_lock();
nfqnl_flush(inst, NULL, 0);
+ rcu_read_unlock();
kfree(inst);
module_put(THIS_MODULE);
}
@@ -227,22 +227,174 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
return entry;
}
-static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
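+/* Run the remaining hooks of this hook point, starting at *index.
+ * NF_REPEAT re-runs the same hook function; any other non-accept
+ * verdict stops the traversal and is returned with *index updated.
+ */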
+static unsigned int nf_iterate(struct sk_buff *skb,
+ struct nf_hook_state *state,
+ const struct nf_hook_entries *hooks,
+ unsigned int *index)
+{
+ const struct nf_hook_entry *hook;
+ unsigned int verdict, i = *index;
+
+ while (i < hooks->num_hook_entries) {
+ hook = &hooks->hooks[i];
+repeat:
+ verdict = nf_hook_entry_hookfn(hook, skb, state);
+ if (verdict != NF_ACCEPT) {
+ *index = i;
+ if (verdict != NF_REPEAT)
+ return verdict;
+ goto repeat;
+ }
+ i++;
+ }
+
+ *index = i;
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
+{
+ switch (pf) {
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ case NFPROTO_BRIDGE:
+ return rcu_dereference(net->nf.hooks_bridge[hooknum]);
+#endif
+ case NFPROTO_IPV4:
+ return rcu_dereference(net->nf.hooks_ipv4[hooknum]);
+ case NFPROTO_IPV6:
+ return rcu_dereference(net->nf.hooks_ipv6[hooknum]);
+ default:
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+}
+
+static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
+{
+#ifdef CONFIG_INET
+ const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
+
+ if (entry->state.hook == NF_INET_LOCAL_OUT) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
+ return ip_route_me_harder(entry->state.net, entry->state.sk,
+ skb, RTN_UNSPEC);
+ }
+#endif
+ return 0;
+}
+
+static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ const struct nf_ipv6_ops *v6ops;
+ int ret = 0;
+
+ switch (entry->state.pf) {
+ case AF_INET:
+ ret = nf_ip_reroute(skb, entry);
+ break;
+ case AF_INET6:
+ v6ops = rcu_dereference(nf_ipv6_ops);
+ if (v6ops)
+ ret = v6ops->reroute(skb, entry);
+ break;
+ }
+ return ret;
+}
+
+/* caller must hold rcu read-side lock */
+static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
- struct nf_ct_hook *ct_hook;
+ const struct nf_hook_entry *hook_entry;
+ const struct nf_hook_entries *hooks;
+ struct sk_buff *skb = entry->skb;
+ const struct net *net;
+ unsigned int i;
int err;
+ u8 pf;
+
+ net = entry->state.net;
+ pf = entry->state.pf;
+
+ hooks = nf_hook_entries_head(net, pf, entry->state.hook);
+
+ i = entry->hook_index;
+ if (!hooks || i >= hooks->num_hook_entries) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
+ nf_queue_entry_free(entry);
+ return;
+ }
+
+ hook_entry = &hooks->hooks[i];
+
+ /* Continue traversal iff userspace said ok... */
+ if (verdict == NF_REPEAT)
+ verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
+
+ if (verdict == NF_ACCEPT) {
+ if (nf_reroute(skb, entry) < 0)
+ verdict = NF_DROP;
+ }
+
+ if (verdict == NF_ACCEPT) {
+next_hook:
+ ++i;
+ verdict = nf_iterate(skb, &entry->state, hooks, &i);
+ }
+
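+	/* ACCEPT and STOP hand the skb on to the okfn, QUEUE re-queues it at
+	 * the next hook, STOLEN means the hook took ownership of the skb;
+	 * anything else is dropped.
+	 */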
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ case NF_STOP:
+ local_bh_disable();
+ entry->state.okfn(entry->state.net, entry->state.sk, skb);
+ local_bh_enable();
+ break;
+ case NF_QUEUE:
+ err = nf_queue(skb, &entry->state, i, verdict);
+ if (err == 1)
+ goto next_hook;
+ break;
+ case NF_STOLEN:
+ break;
+ default:
+ kfree_skb(skb);
+ }
+
+ nf_queue_entry_free(entry);
+}
+
+static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+{
+ const struct nf_ct_hook *ct_hook;
if (verdict == NF_ACCEPT ||
verdict == NF_REPEAT ||
verdict == NF_STOP) {
+ unsigned int ct_verdict = verdict;
+
rcu_read_lock();
ct_hook = rcu_dereference(nf_ct_hook);
- if (ct_hook) {
- err = ct_hook->update(entry->state.net, entry->skb);
- if (err < 0)
- verdict = NF_DROP;
- }
+ if (ct_hook)
+ ct_verdict = ct_hook->update(entry->state.net, entry->skb);
rcu_read_unlock();
+
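+		/* The conntrack update hook can override the verdict from
+		 * userspace, e.g. steal the skb or request a drop.
+		 */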
+ switch (ct_verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ /* follow userspace verdict, could be REPEAT */
+ break;
+ case NF_STOLEN:
+ nf_queue_entry_free(entry);
+ return;
+ default:
+ verdict = ct_verdict & NF_VERDICT_MASK;
+ break;
+ }
}
nf_reinject(entry, verdict);
}
@@ -305,18 +457,31 @@ nla_put_failure:
return -1;
}
-static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
{
- u32 seclen = 0;
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ if (sk && sk_fullsock(sk)) {
+ u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+
+ if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
+{
+ int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+
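+	/* seclen is the security context length on success, or a negative
+	 * errno from security_secid_to_secctx(); callers treat < 0 as error.
+	 */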
if (!skb || !sk_fullsock(skb->sk))
return 0;
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->secmark)
- security_secid_to_secctx(skb->secmark, secdata, &seclen);
-
+ seclen = security_secid_to_secctx(skb->secmark, ctx);
read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
return seclen;
@@ -351,7 +516,7 @@ static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
if (skb_vlan_tag_present(entskb)) {
struct nlattr *nest;
- nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED);
+ nest = nla_nest_start(skb, NFQA_VLAN);
if (!nest)
goto nla_put_failure;
@@ -375,6 +540,14 @@ nla_put_failure:
return -1;
}
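+/* SCTP uses a CRC32c checksum rather than the Internet checksum, so it
+ * needs the CRC32c variant of software checksum help before queueing.
+ */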
+static int nf_queue_checksum_help(struct sk_buff *entskb)
+{
+ if (skb_csum_is_sctp(entskb))
+ return skb_crc32c_csum_help(entskb);
+
+ return skb_checksum_help(entskb);
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
@@ -387,18 +560,18 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
struct sk_buff *entskb = entry->skb;
struct net_device *indev;
struct net_device *outdev;
struct nf_conn *ct = NULL;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
+ enum ip_conntrack_info ctinfo = 0;
+ const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
- char *secdata = NULL;
- u32 seclen = 0;
+ struct lsm_context ctx = { NULL, 0, 0 };
+ int seclen = 0;
+ ktime_t tstamp;
- size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -407,11 +580,16 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ + nla_total_size(sizeof(u_int32_t)) /* priority */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
+ nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+ + nla_total_size(sizeof(u_int32_t)) /* classid */
+#endif
+ nla_total_size(sizeof(u_int32_t)); /* cap_len */
- if (entskb->tstamp)
+ tstamp = skb_tstamp_cond(entskb, false);
+ if (tstamp)
size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
size += nfqnl_get_bridge_size(entry);
@@ -432,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
case NFQNL_COPY_PACKET:
if (!(queue->flags & NFQA_CFG_F_GSO) &&
entskb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(entskb))
+ nf_queue_checksum_help(entskb))
return NULL;
data_len = READ_ONCE(queue->copy_range);
@@ -448,21 +626,25 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfnl_ct = rcu_dereference(nfnl_ct_hook);
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
if (queue->flags & NFQA_CFG_F_CONNTRACK) {
if (nfnl_ct != NULL) {
- ct = nfnl_ct->get_ct(entskb, &ctinfo);
+ ct = nf_ct_get(entskb, &ctinfo);
if (ct != NULL)
size += nfnl_ct->build_size(ct);
}
}
+#endif
if (queue->flags & NFQA_CFG_F_UID_GID) {
- size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ size += (nla_total_size(sizeof(u_int32_t)) /* uid */
+ nla_total_size(sizeof(u_int32_t))); /* gid */
}
if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
- seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ seclen = nfqnl_get_sk_secctx(entskb, &ctx);
+ if (seclen < 0)
+ return NULL;
if (seclen)
size += nla_total_size(seclen);
}
@@ -473,18 +655,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
goto nlmsg_failure;
}
- nlh = nlmsg_put(skb, 0, 0,
- nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
- sizeof(struct nfgenmsg), 0);
+ nlh = nfnl_msg_put(skb, 0, 0,
+ nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
+ 0, entry->state.pf, NFNETLINK_V0,
+ htons(queue->queue_num));
if (!nlh) {
skb_tx_error(entskb);
kfree_skb(skb);
goto nlmsg_failure;
}
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = entry->state.pf;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(queue->queue_num);
nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
pmsg = nla_data(nla);
@@ -565,8 +744,13 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
goto nla_put_failure;
+ if (entskb->priority &&
+ nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
+ goto nla_put_failure;
+
if (indev && entskb->dev &&
- entskb->mac_header != entskb->network_header) {
+ skb_mac_header_was_set(entskb) &&
+ skb_mac_header_len(entskb) != 0) {
struct nfqnl_msg_packet_hw phw;
int len;
@@ -582,9 +766,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure;
- if (entskb->tstamp) {
+ if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
struct nfqnl_msg_packet_timestamp ts;
- struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
+ struct timespec64 kts = ktime_to_timespec64(tstamp);
ts.sec = cpu_to_be64(kts.tv_sec);
ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);
@@ -597,7 +781,10 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
+ goto nla_put_failure;
+
+ if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
goto nla_put_failure;
if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
@@ -625,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
nlh->nlmsg_len = skb->len;
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return skb;
nla_put_failure:
@@ -634,8 +821,8 @@ nla_put_failure:
kfree_skb(skb);
net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return NULL;
}
@@ -643,10 +830,41 @@ static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- const struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+ unsigned long status;
+ unsigned int use;
- if (ct && ((ct->status & flags) == IPS_DYING))
+ if (!ct)
+ return false;
+
+ status = READ_ONCE(ct->status);
+ if ((status & flags) == IPS_DYING)
return true;
+
+ if (status & IPS_CONFIRMED)
+ return false;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+	 * This can't be solved with serialization here because one clone
+	 * could already have been queued for local delivery.
+ */
+ use = refcount_read(&ct->ct_general.use);
+ if (likely(use == 1))
+ return false;
+
+ /* Can't decrement further? Exclusive ownership. */
+ if (!refcount_dec_not_one(&ct->ct_general.use))
+ return false;
+
+ skb_set_nfct(entry->skb, 0);
+ /* No nf_ct_put(): we already decremented .use and it cannot
+ * drop down to 0.
+ */
+ return true;
#endif
return false;
}
@@ -685,7 +903,7 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
*packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid);
if (err < 0) {
if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
failopen = 1;
@@ -715,9 +933,15 @@ static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
- if (entry)
- nf_queue_entry_get_refs(entry);
- return entry;
+
+ if (!entry)
+ return NULL;
+
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+
+ kfree(entry);
+ return NULL;
}
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -741,12 +965,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif
-static void free_entry(struct nf_queue_entry *entry)
-{
- nf_queue_entry_release_refs(entry);
- kfree(entry);
-}
-
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
struct sk_buff *skb, struct nf_queue_entry *entry)
@@ -772,7 +990,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
entry_seg->skb = skb;
ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
if (ret)
- free_entry(entry_seg);
+ nf_queue_entry_free(entry_seg);
}
return ret;
}
@@ -782,7 +1000,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
unsigned int queued;
struct nfqnl_instance *queue;
- struct sk_buff *skb, *segs;
+ struct sk_buff *skb, *segs, *nskb;
int err = -ENOBUFS;
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -806,7 +1024,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
break;
}
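+	/* Non-GSO skbs, and GSO skbs on instances that accept GSO, are
+	 * queued as-is; GSO SCTP skbs are always software-segmented first.
+	 */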
- if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb))
+ if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
return __nfqnl_enqueue_packet(net, queue, entry);
nf_bridge_adjust_skb_data(skb);
@@ -819,8 +1037,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
goto out_err;
queued = 0;
err = 0;
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
if (err == 0)
err = __nfqnl_enqueue_packet_gso(net, queue,
segs, entry);
@@ -828,12 +1045,11 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
queued++;
else
kfree_skb(segs);
- segs = nskb;
- } while (segs);
+ }
if (queued) {
if (err) /* some segments are already queued */
- free_entry(entry);
+ nf_queue_entry_free(entry);
kfree_skb(skb);
return 0;
}
@@ -843,11 +1059,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
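+		/* Refuse to trim the packet below the transport header:
+		 * a shrinking mangle may only shorten the payload.
+		 */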
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
@@ -863,7 +1084,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
}
skb_put(e->skb, diff);
}
- if (!skb_make_writable(e->skb, data_len))
+ if (skb_ensure_writable(e->skb, data_len))
return -ENOMEM;
skb_copy_to_linear_data(e->skb, data, data_len);
e->skb->ip_summed = CHECKSUM_NONE;
@@ -965,6 +1186,16 @@ static void nfqnl_nf_hook_drop(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int i;
+ /* This function is also called on net namespace error unwind,
+ * when pernet_ops->init() failed and ->exit() functions of the
+	 * previous pernet_ops get called.
+ *
+ * This may result in a call to nfqnl_nf_hook_drop() before
+ * struct nfnl_queue_net was allocated.
+ */
+ if (!q)
+ return;
+
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;
struct hlist_head *head = &q->instance_table[i];
@@ -1017,11 +1248,13 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
[NFQA_CT] = { .type = NLA_UNSPEC },
[NFQA_EXP] = { .type = NLA_UNSPEC },
[NFQA_VLAN] = { .type = NLA_NESTED },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
[NFQA_MARK] = { .type = NLA_U32 },
+ [NFQA_PRIORITY] = { .type = NLA_U32 },
};
static struct nfqnl_instance *
@@ -1060,20 +1293,17 @@ static int nfq_id_after(unsigned int id, unsigned int max)
return (int)(id - max) > 0;
}
-static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u16 queue_num = ntohs(info->nfmsg->res_id);
struct nf_queue_entry *entry, *tmp;
- unsigned int verdict, maxid;
struct nfqnl_msg_verdict_hdr *vhdr;
struct nfqnl_instance *queue;
+ unsigned int verdict, maxid;
LIST_HEAD(batch_list);
- u16 queue_num = ntohs(nfmsg->res_id);
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
queue = verdict_instance_lookup(q, queue_num,
NETLINK_CB(skb).portid);
@@ -1105,20 +1335,24 @@ static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
}
return 0;
}
-static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
+static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
const struct nlmsghdr *nlh,
const struct nlattr * const nfqa[],
struct nf_queue_entry *entry,
enum ip_conntrack_info *ctinfo)
{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct nf_conn *ct;
- ct = nfnl_ct->get_ct(entry->skb, ctinfo);
+ ct = nf_ct_get(entry->skb, ctinfo);
if (ct == NULL)
return NULL;
@@ -1130,6 +1364,9 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
NETLINK_CB(entry->skb).portid,
nlmsg_report(nlh));
return ct;
+#else
+ return NULL;
+#endif
}
static int nfqa_parse_bridge(struct nf_queue_entry *entry,
@@ -1139,8 +1376,9 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
struct nlattr *tb[NFQA_VLAN_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN],
- nfqa_vlan_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX,
+ nfqa[NFQA_VLAN],
+ nfqa_vlan_policy, NULL);
if (err < 0)
return err;
@@ -1167,22 +1405,18 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
return 0;
}
-static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
+ const struct nfnl_ct_hook *nfnl_ct;
struct nfqnl_msg_verdict_hdr *vhdr;
+ enum ip_conntrack_info ctinfo;
struct nfqnl_instance *queue;
- unsigned int verdict;
struct nf_queue_entry *entry;
- enum ip_conntrack_info uninitialized_var(ctinfo);
- struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ unsigned int verdict;
int err;
queue = verdict_instance_lookup(q, queue_num,
@@ -1205,7 +1439,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_CT]) {
if (nfnl_ct != NULL)
- ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
+ ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
+ &ctinfo);
}
if (entry->state.pf == PF_BRIDGE) {
@@ -1229,14 +1464,15 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+ if (nfqa[NFQA_PRIORITY])
+ entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));
+
nfqnl_reinject(entry, verdict);
return 0;
}
-static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const cda[])
{
return -ENOTSUPP;
}
@@ -1254,16 +1490,13 @@ static const struct nf_queue_handler nfqh = {
.nf_hook_drop = nfqnl_nf_hook_drop,
};
-static int nfqnl_recv_config(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[],
- struct netlink_ext_ack *extack)
+static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
+ const struct nlattr * const nfqa[])
{
- struct nfgenmsg *nfmsg = nlmsg_data(nlh);
- u_int16_t queue_num = ntohs(nfmsg->res_id);
- struct nfqnl_instance *queue;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
struct nfqnl_msg_config_cmd *cmd = NULL;
- struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ struct nfqnl_instance *queue;
__u32 flags = 0, mask = 0;
int ret = 0;
@@ -1382,17 +1615,29 @@ err_out_unlock:
}
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
- [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp,
- .attr_count = NFQA_MAX, },
- [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_policy },
- [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config,
- .attr_count = NFQA_CFG_MAX,
- .policy = nfqa_cfg_policy },
- [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch,
- .attr_count = NFQA_MAX,
- .policy = nfqa_verdict_batch_policy },
+ [NFQNL_MSG_PACKET] = {
+ .call = nfqnl_recv_unsupp,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ },
+ [NFQNL_MSG_VERDICT] = {
+ .call = nfqnl_recv_verdict,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_policy
+ },
+ [NFQNL_MSG_CONFIG] = {
+ .call = nfqnl_recv_config,
+ .type = NFNL_CB_MUTEX,
+ .attr_count = NFQA_CFG_MAX,
+ .policy = nfqa_cfg_policy
+ },
+ [NFQNL_MSG_VERDICT_BATCH] = {
+ .call = nfqnl_recv_verdict_batch,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFQA_MAX,
+ .policy = nfqa_verdict_batch_policy
+ },
};
static const struct nfnetlink_subsystem nfqnl_subsys = {
@@ -1510,7 +1755,6 @@ static int __net_init nfnl_queue_net_init(struct net *net)
&nfqnl_seq_ops, sizeof(struct iter_state)))
return -ENOMEM;
#endif
- nf_register_queue_handler(net, &nfqh);
return 0;
}
@@ -1519,7 +1763,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
unsigned int i;
- nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
@@ -1527,15 +1770,9 @@ static void __net_exit nfnl_queue_net_exit(struct net *net)
WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
}
-static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
-{
- synchronize_rcu();
-}
-
static struct pernet_operations nfnl_queue_net_ops = {
.init = nfnl_queue_net_init,
.exit = nfnl_queue_net_exit,
- .exit_batch = nfnl_queue_net_exit_batch,
.id = &nfnl_queue_net_id,
.size = sizeof(struct nfnl_queue_net),
};
@@ -1563,6 +1800,8 @@ static int __init nfnetlink_queue_init(void)
goto cleanup_netlink_subsys;
}
+ nf_register_queue_handler(&nfqh);
+
return status;
cleanup_netlink_subsys:
@@ -1576,6 +1815,7 @@ out:
static void __exit nfnetlink_queue_fini(void)
{
+ nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index fff8073e2a56..d550910aabec 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -16,118 +13,430 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
struct nft_bitwise {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 sreg2;
+ u8 dreg;
+ enum nft_bitwise_ops op:8;
u8 len;
struct nft_data mask;
struct nft_data xor;
+ struct nft_data data;
};
-static void nft_bitwise_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
{
- const struct nft_bitwise *priv = nft_expr_priv(expr);
- const u32 *src = &regs->data[priv->sreg];
- u32 *dst = &regs->data[priv->dreg];
unsigned int i;
- for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++)
dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
}
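+/* The shift helpers work on the register 32 bits at a time, carrying the
+ * bits shifted out of one word into the next word processed: lshift walks
+ * from the highest word index down, rshift from the lowest index up.
+ */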
+static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
+{
+ u32 shift = priv->data.data[0];
+ unsigned int i;
+ u32 carry = 0;
+
+ for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) {
+ dst[i - 1] = (src[i - 1] << shift) | carry;
+ carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift);
+ }
+}
+
+static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
+ const struct nft_bitwise *priv)
+{
+ u32 shift = priv->data.data[0];
+ unsigned int i;
+ u32 carry = 0;
+
+ for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) {
+ dst[i] = carry | (src[i] >> shift);
+ carry = src[i] << (BITS_PER_TYPE(u32) - shift);
+ }
+}
+
+static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] & src2[i];
+}
+
+static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] | src2[i];
+}
+
+static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2,
+ const struct nft_bitwise *priv)
+{
+ unsigned int i, n;
+
+ for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++)
+ dst[i] = src[i] ^ src2[i];
+}
+
+void nft_bitwise_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const u32 *src = &regs->data[priv->sreg], *src2;
+ u32 *dst = &regs->data[priv->dreg];
+
+ if (priv->op == NFT_BITWISE_MASK_XOR) {
+ nft_bitwise_eval_mask_xor(dst, src, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_LSHIFT) {
+ nft_bitwise_eval_lshift(dst, src, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_RSHIFT) {
+ nft_bitwise_eval_rshift(dst, src, priv);
+ return;
+ }
+
+ src2 = priv->sreg2 ? &regs->data[priv->sreg2] : priv->data.data;
+
+ if (priv->op == NFT_BITWISE_AND) {
+ nft_bitwise_eval_and(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_OR) {
+ nft_bitwise_eval_or(dst, src, src2, priv);
+ return;
+ }
+ if (priv->op == NFT_BITWISE_XOR) {
+ nft_bitwise_eval_xor(dst, src, src2, priv);
+ return;
+ }
+}
+
static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
[NFTA_BITWISE_SREG] = { .type = NLA_U32 },
+ [NFTA_BITWISE_SREG2] = { .type = NLA_U32 },
[NFTA_BITWISE_DREG] = { .type = NLA_U32 },
[NFTA_BITWISE_LEN] = { .type = NLA_U32 },
[NFTA_BITWISE_MASK] = { .type = NLA_NESTED },
[NFTA_BITWISE_XOR] = { .type = NLA_NESTED },
+ [NFTA_BITWISE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BITWISE_DATA] = { .type = NLA_NESTED },
};
+static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ struct nft_data_desc mask = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->mask),
+ .len = priv->len,
+ };
+ struct nft_data_desc xor = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->xor),
+ .len = priv->len,
+ };
+ int err;
+
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_MASK] ||
+ !tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->mask, &mask, tb[NFTA_BITWISE_MASK]);
+ if (err < 0)
+ return err;
+
+ err = nft_data_init(NULL, &priv->xor, &xor, tb[NFTA_BITWISE_XOR]);
+ if (err < 0)
+ goto err_xor_err;
+
+ return 0;
+
+err_xor_err:
+ nft_data_release(&priv->mask, mask.type);
+
+ return err;
+}
+
+static int nft_bitwise_init_shift(struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = sizeof(u32),
+ };
+ int err;
+
+ if (tb[NFTA_BITWISE_MASK] ||
+ tb[NFTA_BITWISE_XOR] ||
+ tb[NFTA_BITWISE_SREG2])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_DATA])
+ return -EINVAL;
+
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_BITWISE_DATA]);
+ if (err < 0)
+ return err;
+
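+	/* Shift counts of 32 or more would make the per-word C shifts in
+	 * the eval helpers undefined, so reject them at init time.
+	 */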
+ if (priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+ nft_data_release(&priv->data, desc.type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
+ struct nft_bitwise *priv,
+ const struct nlattr *const tb[])
+{
+ int err;
+
+ if (tb[NFTA_BITWISE_MASK] ||
+ tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) ||
+ (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2]))
+ return -EINVAL;
+
+ if (tb[NFTA_BITWISE_DATA]) {
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ .len = priv->len,
+ };
+
+ err = nft_data_init(NULL, &priv->data, &desc,
+ tb[NFTA_BITWISE_DATA]);
+ if (err < 0)
+ return err;
+ } else {
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2],
+ &priv->sreg2, priv->len);
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
static int nft_bitwise_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_bitwise *priv = nft_expr_priv(expr);
- struct nft_data_desc d1, d2;
u32 len;
int err;
- if (tb[NFTA_BITWISE_SREG] == NULL ||
- tb[NFTA_BITWISE_DREG] == NULL ||
- tb[NFTA_BITWISE_LEN] == NULL ||
- tb[NFTA_BITWISE_MASK] == NULL ||
- tb[NFTA_BITWISE_XOR] == NULL)
- return -EINVAL;
-
err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
if (err < 0)
return err;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
- tb[NFTA_BITWISE_MASK]);
- if (err < 0)
- return err;
- if (d1.len != priv->len) {
- err = -EINVAL;
- goto err1;
+ if (tb[NFTA_BITWISE_OP]) {
+ priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
+ switch (priv->op) {
+ case NFT_BITWISE_MASK_XOR:
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ } else {
+ priv->op = NFT_BITWISE_MASK_XOR;
}
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
- tb[NFTA_BITWISE_XOR]);
- if (err < 0)
- goto err1;
- if (d2.len != priv->len) {
- err = -EINVAL;
- goto err2;
+	switch (priv->op) {
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_init_mask_xor(priv, tb);
+ break;
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ err = nft_bitwise_init_shift(priv, tb);
+ break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_init_bool(ctx, priv, tb);
+ break;
}
- return 0;
-err2:
- nft_data_release(&priv->xor, d2.type);
-err1:
- nft_data_release(&priv->mask, d1.type);
return err;
}
-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_dump_mask_xor(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
+ NFT_DATA_VALUE, priv->len) < 0)
+ return -1;
+
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
+ NFT_DATA_VALUE, priv->len) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_bitwise_dump_shift(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+ return 0;
+}
+
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+ const struct nft_bitwise *priv)
+{
+ if (priv->sreg2) {
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2))
+ return -1;
+ } else {
+ if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_bitwise *priv = nft_expr_priv(expr);
+ int err = 0;
if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
- goto nla_put_failure;
+ return -1;
if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
- goto nla_put_failure;
+ return -1;
if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
- goto nla_put_failure;
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op)))
+ return -1;
- if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
- NFT_DATA_VALUE, priv->len) < 0)
- goto nla_put_failure;
+ switch (priv->op) {
+ case NFT_BITWISE_MASK_XOR:
+ err = nft_bitwise_dump_mask_xor(skb, priv);
+ break;
+ case NFT_BITWISE_LSHIFT:
+ case NFT_BITWISE_RSHIFT:
+ err = nft_bitwise_dump_shift(skb, priv);
+ break;
+ case NFT_BITWISE_AND:
+ case NFT_BITWISE_OR:
+ case NFT_BITWISE_XOR:
+ err = nft_bitwise_dump_bool(skb, priv);
+ break;
+ }
- if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
- NFT_DATA_VALUE, priv->len) < 0)
- goto nla_put_failure;
+ return err;
+}
+
+static struct nft_data zero;
+
+static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ if (priv->op != NFT_BITWISE_MASK_XOR)
+ return -EOPNOTSUPP;
+
+ if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
+ priv->sreg != priv->dreg || priv->len != reg->len)
+ return -EOPNOTSUPP;
+
+ memcpy(&reg->mask, &priv->mask, sizeof(priv->mask));
return 0;
+}
+
+static bool nft_bitwise_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise *priv = nft_expr_priv(expr);
+ const struct nft_bitwise *bitwise;
+ unsigned int regcount;
+ u8 dreg;
+ int i;
+
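+	/* Three outcomes: the destination already holds the result of an
+	 * identical bitwise expression (return true, the expression is
+	 * redundant), the source register state is unknown (cancel tracking
+	 * for dreg), or this expression becomes the tracked state of dreg.
+	 */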
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->sreg].num_reg == 0 &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->sreg2 == bitwise->sreg2 &&
+ priv->dreg == bitwise->dreg &&
+ priv->op == bitwise->op &&
+ priv->len == bitwise->len &&
+ !memcmp(&priv->mask, &bitwise->mask, sizeof(priv->mask)) &&
+ !memcmp(&priv->xor, &bitwise->xor, sizeof(priv->xor)) &&
+ !memcmp(&priv->data, &bitwise->data, sizeof(priv->data))) {
+ track->cur = expr;
+ return true;
+ }
-nla_put_failure:
- return -1;
+ if (track->regs[priv->sreg].bitwise ||
+ track->regs[priv->sreg].num_reg != 0) {
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ nft_reg_track_update(track, track->regs[priv->sreg].selector,
+ priv->dreg, priv->len);
+ }
+
+ dreg = priv->dreg;
+ regcount = DIV_ROUND_UP(priv->len, NFT_REG32_SIZE);
+ for (i = 0; i < regcount; i++, dreg++)
+ track->regs[dreg].bitwise = expr;
+
+ return false;
}
static const struct nft_expr_ops nft_bitwise_ops = {
@@ -136,12 +445,203 @@ static const struct nft_expr_ops nft_bitwise_ops = {
.eval = nft_bitwise_eval,
.init = nft_bitwise_init,
.dump = nft_bitwise_dump,
+ .reduce = nft_bitwise_reduce,
+ .offload = nft_bitwise_offload,
+};
+
+static int
+nft_bitwise_extract_u32_data(const struct nlattr * const tb, u32 *out)
+{
+ struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ .len = sizeof(u32),
+ };
+ int err;
+
+ err = nft_data_init(NULL, &data, &desc, tb);
+ if (err < 0)
+ return err;
+
+ *out = data.data[0];
+
+ return 0;
+}
+
+static int nft_bitwise_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ int err;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg,
+ sizeof(u32));
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_store(ctx, tb[NFTA_BITWISE_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_BITWISE_DATA] ||
+ tb[NFTA_BITWISE_SREG2])
+ return -EINVAL;
+
+ if (!tb[NFTA_BITWISE_MASK] ||
+ !tb[NFTA_BITWISE_XOR])
+ return -EINVAL;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_MASK], &priv->mask);
+ if (err < 0)
+ return err;
+
+ err = nft_bitwise_extract_u32_data(tb[NFTA_BITWISE_XOR], &priv->xor);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int
+nft_bitwise_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data data;
+
+ if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
+ return -1;
+ if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32))))
+ return -1;
+ if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR)))
+ return -1;
+
+ data.data[0] = priv->mask;
+ if (nft_data_dump(skb, NFTA_BITWISE_MASK, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ data.data[0] = priv->xor;
+ if (nft_data_dump(skb, NFTA_BITWISE_XOR, &data,
+ NFT_DATA_VALUE, sizeof(u32)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_bitwise_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ if (priv->xor || priv->sreg != priv->dreg || reg->len != sizeof(u32))
+ return -EOPNOTSUPP;
+
+ reg->mask.data[0] = priv->mask;
+ return 0;
+}
+
+static bool nft_bitwise_fast_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr);
+ const struct nft_bitwise_fast_expr *bitwise;
+
+ if (!track->regs[priv->sreg].selector)
+ return false;
+
+ bitwise = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (track->regs[priv->sreg].selector == track->regs[priv->dreg].selector &&
+ track->regs[priv->dreg].bitwise &&
+ track->regs[priv->dreg].bitwise->ops == expr->ops &&
+ priv->sreg == bitwise->sreg &&
+ priv->dreg == bitwise->dreg &&
+ priv->mask == bitwise->mask &&
+ priv->xor == bitwise->xor) {
+ track->cur = expr;
+ return true;
+ }
+
+ if (track->regs[priv->sreg].bitwise) {
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+ return false;
+ }
+
+ if (priv->sreg != priv->dreg) {
+ track->regs[priv->dreg].selector =
+ track->regs[priv->sreg].selector;
+ }
+ track->regs[priv->dreg].bitwise = expr;
+
+ return false;
+}
+
+const struct nft_expr_ops nft_bitwise_fast_ops = {
+ .type = &nft_bitwise_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_bitwise_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_bitwise_fast_init,
+ .dump = nft_bitwise_fast_dump,
+ .reduce = nft_bitwise_fast_reduce,
+ .offload = nft_bitwise_fast_offload,
};
+static const struct nft_expr_ops *
+nft_bitwise_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ int err;
+ u32 len;
+
+ if (!tb[NFTA_BITWISE_LEN] ||
+ !tb[NFTA_BITWISE_SREG] ||
+ !tb[NFTA_BITWISE_DREG])
+ return ERR_PTR(-EINVAL);
+
+ err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ if (len != sizeof(u32))
+ return &nft_bitwise_ops;
+
+ if (tb[NFTA_BITWISE_OP] &&
+ ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR)
+ return &nft_bitwise_ops;
+
+ return &nft_bitwise_fast_ops;
+}
+
struct nft_expr_type nft_bitwise_type __read_mostly = {
.name = "bitwise",
- .ops = &nft_bitwise_ops,
+ .select_ops = nft_bitwise_select_ops,
.policy = nft_bitwise_policy,
.maxattr = NFTA_BITWISE_MAX,
.owner = THIS_MODULE,
};
+
+bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_expr *last = track->last;
+ const struct nft_expr *next;
+
+ if (expr == last)
+ return false;
+
+ next = nft_expr_next(expr);
+ if (next->ops == &nft_bitwise_ops)
+ return nft_bitwise_reduce(track, next);
+ else if (next->ops == &nft_bitwise_fast_ops)
+ return nft_bitwise_fast_reduce(track, next);
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_expr_reduce_bitwise);
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 13d4e421a6b3..af9206a3afd1 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -1,14 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -19,41 +16,44 @@
#include <net/netfilter/nf_tables.h>
struct nft_byteorder {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
enum nft_byteorder_ops op:8;
u8 len;
u8 size;
};
-static void nft_byteorder_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_byteorder_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
u32 *src = &regs->data[priv->sreg];
u32 *dst = &regs->data[priv->dreg];
- union { u32 u32; u16 u16; } *s, *d;
+ u16 *s16, *d16;
unsigned int i;
- s = (void *)src;
- d = (void *)dst;
+ s16 = (void *)src;
+ d16 = (void *)dst;
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
- src64 = get_unaligned((u64 *)&src[i]);
- put_unaligned_be64(src64, &dst[i]);
+ src64 = nft_reg_load64(&src[i]);
+ nft_reg_store64(&dst64[i],
+ be64_to_cpu((__force __be64)src64));
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
- src64 = get_unaligned_be64(&src[i]);
- put_unaligned(src64, (u64 *)&dst[i]);
+ src64 = (__force __u64)
+ cpu_to_be64(nft_reg_load64(&src[i]));
+ nft_reg_store64(&dst64[i], src64);
}
break;
}
@@ -63,11 +63,11 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = ntohl((__force __be32)s[i].u32);
+ dst[i] = ntohl((__force __be32)src[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 4; i++)
- d[i].u32 = (__force __u32)htonl(s[i].u32);
+ dst[i] = (__force __u32)htonl(src[i]);
break;
}
break;
@@ -75,11 +75,11 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = ntohs((__force __be16)s[i].u16);
+ d16[i] = ntohs((__force __be16)s16[i]);
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 2; i++)
- d[i].u16 = (__force __u16)htons(s[i].u16);
+ d16[i] = (__force __u16)htons(s16[i]);
break;
}
break;
@@ -89,9 +89,9 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
static const struct nla_policy nft_byteorder_policy[NFTA_BYTEORDER_MAX + 1] = {
[NFTA_BYTEORDER_SREG] = { .type = NLA_U32 },
[NFTA_BYTEORDER_DREG] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_OP] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_LEN] = { .type = NLA_U32 },
- [NFTA_BYTEORDER_SIZE] = { .type = NLA_U32 },
+ [NFTA_BYTEORDER_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_BYTEORDER_SIZE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_byteorder_init(const struct nft_ctx *ctx,
@@ -133,23 +133,24 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
return -EINVAL;
}
- priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]);
err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len);
if (err < 0)
return err;
priv->len = len;
- err = nft_validate_register_load(priv->sreg, priv->len);
+ err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg,
+ priv->len);
if (err < 0)
return err;
- priv->dreg = nft_parse_register(tb[NFTA_BYTEORDER_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
-static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_byteorder_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_byteorder *priv = nft_expr_priv(expr);
@@ -169,12 +170,23 @@ nla_put_failure:
return -1;
}
+static bool nft_byteorder_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_byteorder *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, priv->len);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_byteorder_ops = {
.type = &nft_byteorder_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_byteorder)),
.eval = nft_byteorder_eval,
.init = nft_byteorder_init,
.dump = nft_byteorder_dump,
+ .reduce = nft_byteorder_reduce,
};
struct nft_expr_type nft_byteorder_type __read_mostly = {
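For reference, the reworked eval loops convert register data between network and host byte order in 2-, 4- and 8-byte chunks. A standalone sketch of the 16- and 32-bit cases using the standard ntohs()/ntohl() helpers (the register layout is simplified to a flat byte array):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A fake 8-byte register area loaded with network-order data. */
	const uint8_t wire[8] = { 0x00, 0x01, 0x00, 0x02,
				  0x00, 0x03, 0x00, 0x04 };
	uint16_t v16[4];
	uint32_t v32[2];
	unsigned int i;

	/* size == 2: len/2 rounds of ntohs(), as in the 16-bit case */
	memcpy(v16, wire, sizeof(v16));
	for (i = 0; i < 4; i++)
		printf("%u ", (unsigned int)ntohs(v16[i])); /* 1 2 3 4 */
	printf("\n");

	/* size == 4: len/4 rounds of ntohl(), as in the 32-bit case */
	memcpy(v32, wire, sizeof(v32));
	for (i = 0; i < 2; i++)
		printf("%u ", (unsigned int)ntohl(v32[i])); /* 65538 196612 */
	printf("\n");
	return 0;
}

The 8-byte case works the same way but goes through the nft_reg_load64/nft_reg_store64 helpers to keep 64-bit register accesses well-defined.
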
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 3fd540b2c6ba..b16185e9a6dd 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -18,7 +18,7 @@ static unsigned int nft_do_chain_ipv4(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -62,7 +62,7 @@ static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -102,7 +102,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
struct nft_pktinfo pkt;
nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
return nft_do_chain(&pkt, priv);
}
@@ -149,10 +149,10 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
switch (state->pf) {
case NFPROTO_IPV4:
- nft_set_pktinfo_ipv4(&pkt, skb);
+ nft_set_pktinfo_ipv4(&pkt);
break;
case NFPROTO_IPV6:
- nft_set_pktinfo_ipv6(&pkt, skb);
+ nft_set_pktinfo_ipv6(&pkt);
break;
default:
break;
@@ -161,16 +161,49 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
return nft_do_chain(&pkt, priv);
}
+static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state ingress_state = *state;
+ struct nft_pktinfo pkt;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
+ ingress_state.pf = NFPROTO_IPV4;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0)
+ return NF_DROP;
+ break;
+ case htons(ETH_P_IPV6):
+ ingress_state.pf = NFPROTO_IPV6;
+ ingress_state.hook = NF_INET_INGRESS;
+ nft_set_pktinfo(&pkt, skb, &ingress_state);
+
+ if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0)
+ return NF_DROP;
+ break;
+ default:
+ return NF_ACCEPT;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
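The new inet ingress handler dispatches on the EtherType before building the pktinfo, accepting anything that is neither IPv4 nor IPv6. A portable sketch of that dispatch (the kernel can use htons() in switch-case labels thanks to constant folding; plain C sticks to if-chains):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define ETH_P_IP	0x0800
#define ETH_P_IPV6	0x86DD

/* Pick the chain family from the EtherType; anything else is accepted
 * unseen, just like the default: branch in the ingress hook. */
static const char *classify(uint16_t proto)	/* network byte order */
{
	if (proto == htons(ETH_P_IP))
		return "IPv4 chain";
	if (proto == htons(ETH_P_IPV6))
		return "IPv6 chain";
	return "accept (neither IPv4 nor IPv6)";
}

int main(void)
{
	printf("%s\n", classify(htons(ETH_P_IP)));	/* IPv4 chain */
	printf("%s\n", classify(htons(0x0806)));	/* ARP: accepted */
	return 0;
}
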
static const struct nft_chain_type nft_chain_filter_inet = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_INET,
- .hook_mask = (1 << NF_INET_LOCAL_IN) |
+ .hook_mask = (1 << NF_INET_INGRESS) |
+ (1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_FORWARD) |
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING),
.hooks = {
+ [NF_INET_INGRESS] = nft_do_chain_inet_ingress,
[NF_INET_LOCAL_IN] = nft_do_chain_inet,
[NF_INET_LOCAL_OUT] = nft_do_chain_inet,
[NF_INET_FORWARD] = nft_do_chain_inet,
@@ -193,7 +226,7 @@ static inline void nft_chain_filter_inet_init(void) {}
static inline void nft_chain_filter_inet_fini(void) {}
#endif /* CONFIG_NF_TABLES_IPV6 */
-#ifdef CONFIG_NF_TABLES_BRIDGE
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
static unsigned int
nft_do_chain_bridge(void *priv,
struct sk_buff *skb,
@@ -205,13 +238,13 @@ nft_do_chain_bridge(void *priv,
switch (eth_hdr(skb)->h_proto) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -260,13 +293,13 @@ static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
switch (skb->protocol) {
case htons(ETH_P_IP):
- nft_set_pktinfo_ipv4_validate(&pkt, skb);
+ nft_set_pktinfo_ipv4_validate(&pkt);
break;
case htons(ETH_P_IPV6):
- nft_set_pktinfo_ipv6_validate(&pkt, skb);
+ nft_set_pktinfo_ipv6_validate(&pkt);
break;
default:
- nft_set_pktinfo_unspec(&pkt, skb);
+ nft_set_pktinfo_unspec(&pkt);
break;
}
@@ -277,72 +310,122 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
.family = NFPROTO_NETDEV,
- .hook_mask = (1 << NF_NETDEV_INGRESS),
+ .hook_mask = (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS),
.hooks = {
[NF_NETDEV_INGRESS] = nft_do_chain_netdev,
+ [NF_NETDEV_EGRESS] = nft_do_chain_netdev,
},
};
-static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_ctx *ctx)
+static int nft_netdev_event(unsigned long event, struct net_device *dev,
+ struct nft_base_chain *basechain, bool changename)
{
- struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
-
- switch (event) {
- case NETDEV_UNREGISTER:
- if (strcmp(basechain->dev_name, dev->name) != 0)
- return;
-
- /* UNREGISTER events are also happpening on netns exit.
- *
- * Altough nf_tables core releases all tables/chains, only
- * this event handler provides guarantee that
- * basechain.ops->dev is still accessible, so we cannot
- * skip exiting net namespaces.
- */
- __nft_release_basechain(ctx);
- break;
- case NETDEV_CHANGENAME:
- if (dev->ifindex != basechain->ops.dev->ifindex)
- return;
+ struct nft_table *table = basechain->chain.table;
+ struct nf_hook_ops *ops;
+ struct nft_hook *hook;
+ bool match;
+
+ list_for_each_entry(hook, &basechain->hook_list, list) {
+ ops = nft_hook_find_ops(hook, dev);
+ match = !strncmp(hook->ifname, dev->name, hook->ifnamelen);
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ /* NOP if not found or new name still matching */
+ if (!ops || (changename && match))
+ continue;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), ops);
- strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+ list_del_rcu(&ops->list);
+ kfree_rcu(ops, rcu);
+ break;
+ case NETDEV_REGISTER:
+ /* NOP if not matching or already registered */
+ if (!match || (changename && ops))
+ continue;
+
+ ops = kmemdup(&basechain->ops,
+ sizeof(struct nf_hook_ops),
+ GFP_KERNEL_ACCOUNT);
+ if (!ops)
+ return 1;
+
+ ops->dev = dev;
+
+ if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+ nf_register_net_hook(dev_net(dev), ops)) {
+ kfree(ops);
+ return 1;
+ }
+ list_add_tail_rcu(&ops->list, &hook->ops_list);
+ break;
+ }
break;
}
+ return 0;
}
-static int nf_tables_netdev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
+static int __nf_tables_netdev_event(unsigned long event,
+ struct net_device *dev,
+ bool changename)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nft_base_chain *basechain;
+ struct nftables_pernet *nft_net;
+ struct nft_chain *chain;
struct nft_table *table;
- struct nft_chain *chain, *nr;
- struct nft_ctx ctx = {
- .net = dev_net(dev),
- };
- if (event != NETDEV_UNREGISTER &&
- event != NETDEV_CHANGENAME)
- return NOTIFY_DONE;
-
- mutex_lock(&ctx.net->nft.commit_mutex);
- list_for_each_entry(table, &ctx.net->nft.tables, list) {
- if (table->family != NFPROTO_NETDEV)
+ nft_net = nft_pernet(dev_net(dev));
+ list_for_each_entry(table, &nft_net->tables, list) {
+ if (table->family != NFPROTO_NETDEV &&
+ table->family != NFPROTO_INET)
continue;
- ctx.family = table->family;
- ctx.table = table;
- list_for_each_entry_safe(chain, nr, &table->chains, list) {
+ list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_base_chain(chain))
continue;
- ctx.chain = chain;
- nft_netdev_event(event, dev, &ctx);
+ basechain = nft_base_chain(chain);
+ if (table->family == NFPROTO_INET &&
+ basechain->ops.hooknum != NF_INET_INGRESS)
+ continue;
+
+ if (nft_netdev_event(event, dev, basechain, changename))
+ return 1;
}
}
- mutex_unlock(&ctx.net->nft.commit_mutex);
+ return 0;
+}
- return NOTIFY_DONE;
+static int nf_tables_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct nftables_pernet *nft_net;
+ int ret = NOTIFY_DONE;
+
+ if (event != NETDEV_REGISTER &&
+ event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
+
+ nft_net = nft_pernet(dev_net(dev));
+ mutex_lock(&nft_net->commit_mutex);
+
+ if (event == NETDEV_CHANGENAME) {
+ if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) {
+ ret = NOTIFY_BAD;
+ goto out_unlock;
+ }
+ __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true);
+ } else if (__nf_tables_netdev_event(event, dev, false)) {
+ ret = NOTIFY_BAD;
+ }
+out_unlock:
+ mutex_unlock(&nft_net->commit_mutex);
+ return ret;
}
static struct notifier_block nf_tables_netdev_notifier = {
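The rename handling above is worth spelling out: NETDEV_CHANGENAME is decomposed into a REGISTER pass under the new name (hook any chains the new name now matches) followed by an UNREGISTER pass (drop hooks whose device no longer matches). A toy model of that two-pass idea, with wildcard matching reduced to a simple prefix test and all names illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Toy model: one chain bound to an interface-name prefix and at most
 * one registered hook; all sizes/inputs are bounded by construction. */
struct chain {
	char ifprefix[16];	/* e.g. "eth" matches eth0, eth1, ... */
	bool hooked;
	char hooked_dev[16];
};

static bool matches(const struct chain *c, const char *dev)
{
	return strncmp(c->ifprefix, dev, strlen(c->ifprefix)) == 0;
}

static void ev_register(struct chain *c, const char *dev)
{
	if (!c->hooked && matches(c, dev)) {	/* NOP if already hooked */
		c->hooked = true;
		strcpy(c->hooked_dev, dev);
		printf("hooked %s\n", dev);
	}
}

static void ev_unregister(struct chain *c, const char *dev)
{
	/* NOP if the (possibly new) name still matches the prefix */
	if (c->hooked && strcmp(c->hooked_dev, dev) == 0 && !matches(c, dev)) {
		c->hooked = false;
		printf("unhooked %s\n", dev);
	}
}

int main(void)
{
	struct chain c = { .ifprefix = "eth" };

	ev_register(&c, "eth0");	/* initial REGISTER: hook attached */

	/* rename eth0 -> wlan0: the binding keeps the now-stale device */
	strcpy(c.hooked_dev, "wlan0");
	ev_register(&c, "wlan0");	/* pass 1: nothing new matches   */
	ev_unregister(&c, "wlan0");	/* pass 2: stale binding removed */
	return 0;
}
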
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
new file mode 100644
index 000000000000..40e230d8b712
--- /dev/null
+++ b/net/netfilter/nft_chain_nat.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ nft_set_pktinfo(&pkt, skb, state);
+
+ switch (state->pf) {
+#ifdef CONFIG_NF_TABLES_IPV4
+ case NFPROTO_IPV4:
+ nft_set_pktinfo_ipv4(&pkt);
+ break;
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ nft_set_pktinfo_ipv6(&pkt);
+ break;
+#endif
+ default:
+ break;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
+#ifdef CONFIG_NF_TABLES_IPV4
+static const struct nft_chain_type nft_chain_nat_ipv4 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV4,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
+ },
+ .ops_register = nf_nat_ipv4_register_fn,
+ .ops_unregister = nf_nat_ipv4_unregister_fn,
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static const struct nft_chain_type nft_chain_nat_ipv6 = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_IPV6,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
+ },
+ .ops_register = nf_nat_ipv6_register_fn,
+ .ops_unregister = nf_nat_ipv6_unregister_fn,
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_INET
+static int nft_nat_inet_reg(struct net *net, const struct nf_hook_ops *ops)
+{
+ return nf_nat_inet_register_fn(net, ops);
+}
+
+static void nft_nat_inet_unreg(struct net *net, const struct nf_hook_ops *ops)
+{
+ nf_nat_inet_unregister_fn(net, ops);
+}
+
+static const struct nft_chain_type nft_chain_nat_inet = {
+ .name = "nat",
+ .type = NFT_CHAIN_T_NAT,
+ .family = NFPROTO_INET,
+ .owner = THIS_MODULE,
+ .hook_mask = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .hooks = {
+ [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
+ [NF_INET_LOCAL_IN] = nft_nat_do_chain,
+ [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
+ [NF_INET_POST_ROUTING] = nft_nat_do_chain,
+ },
+ .ops_register = nft_nat_inet_reg,
+ .ops_unregister = nft_nat_inet_unreg,
+};
+#endif
+
+static int __init nft_chain_nat_init(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_register_chain_type(&nft_chain_nat_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_register_chain_type(&nft_chain_nat_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_register_chain_type(&nft_chain_nat_inet);
+#endif
+
+ return 0;
+}
+
+static void __exit nft_chain_nat_exit(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_unregister_chain_type(&nft_chain_nat_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_chain_type(&nft_chain_nat_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_chain_type(&nft_chain_nat_inet);
+#endif
+}
+
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("nftables network address translation support");
+#ifdef CONFIG_NF_TABLES_IPV4
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
+#endif
+#ifdef CONFIG_NF_TABLES_IPV6
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+MODULE_ALIAS_NFT_CHAIN(1, "nat"); /* NFPROTO_INET */
+#endif
diff --git a/net/netfilter/nft_chain_route.c b/net/netfilter/nft_chain_route.c
new file mode 100644
index 000000000000..925db0dce48d
--- /dev/null
+++ b/net/netfilter/nft_chain_route.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#ifdef CONFIG_NF_TABLES_IPV4
+static unsigned int nf_route_table_hook4(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ const struct iphdr *iph;
+ struct nft_pktinfo pkt;
+ __be32 saddr, daddr;
+ unsigned int ret;
+ u32 mark;
+ int err;
+ u8 tos;
+
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv4(&pkt);
+
+ mark = skb->mark;
+ iph = ip_hdr(skb);
+ saddr = iph->saddr;
+ daddr = iph->daddr;
+ tos = iph->tos;
+
+ ret = nft_do_chain(&pkt, priv);
+ if (ret == NF_ACCEPT) {
+ iph = ip_hdr(skb);
+
+ if (iph->saddr != saddr ||
+ iph->daddr != daddr ||
+ skb->mark != mark ||
+ iph->tos != tos) {
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+ }
+ return ret;
+}
+
+static const struct nft_chain_type nft_chain_route_ipv4 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV4,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook4,
+ },
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static unsigned int nf_route_table_hook6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct in6_addr saddr, daddr;
+ struct nft_pktinfo pkt;
+ u32 mark, flowlabel;
+ unsigned int ret;
+ u8 hop_limit;
+ int err;
+
+ nft_set_pktinfo(&pkt, skb, state);
+ nft_set_pktinfo_ipv6(&pkt);
+
+ /* save source/dest address, mark, hoplimit, flowlabel, priority */
+ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
+ memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
+ mark = skb->mark;
+ hop_limit = ipv6_hdr(skb)->hop_limit;
+
+	/* flowlabel and prio (includes version, which shouldn't change either) */
+ flowlabel = *((u32 *)ipv6_hdr(skb));
+
+ ret = nft_do_chain(&pkt, priv);
+ if (ret == NF_ACCEPT &&
+ (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
+ memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
+ skb->mark != mark ||
+ ipv6_hdr(skb)->hop_limit != hop_limit ||
+ flowlabel != *((u32 *)ipv6_hdr(skb)))) {
+ err = nf_ip6_route_me_harder(state->net, state->sk, skb);
+ if (err < 0)
+ ret = NF_DROP_ERR(err);
+ }
+
+ return ret;
+}
+
+static const struct nft_chain_type nft_chain_route_ipv6 = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_IPV6,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_hook6,
+ },
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_INET
+static unsigned int nf_route_table_inet(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nft_pktinfo pkt;
+
+ switch (state->pf) {
+ case NFPROTO_IPV4:
+ return nf_route_table_hook4(priv, skb, state);
+ case NFPROTO_IPV6:
+ return nf_route_table_hook6(priv, skb, state);
+ default:
+ nft_set_pktinfo(&pkt, skb, state);
+ break;
+ }
+
+ return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_route_inet = {
+ .name = "route",
+ .type = NFT_CHAIN_T_ROUTE,
+ .family = NFPROTO_INET,
+ .hook_mask = (1 << NF_INET_LOCAL_OUT),
+ .hooks = {
+ [NF_INET_LOCAL_OUT] = nf_route_table_inet,
+ },
+};
+#endif
+
+void __init nft_chain_route_init(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_register_chain_type(&nft_chain_route_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_register_chain_type(&nft_chain_route_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_register_chain_type(&nft_chain_route_inet);
+#endif
+}
+
+void __exit nft_chain_route_fini(void)
+{
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_chain_type(&nft_chain_route_ipv6);
+#endif
+#ifdef CONFIG_NF_TABLES_IPV4
+ nft_unregister_chain_type(&nft_chain_route_ipv4);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_chain_type(&nft_chain_route_inet);
+#endif
+}
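Both route hooks follow the same snapshot/compare pattern: record the routing-relevant header fields, run the chain, and pay for a fresh route lookup only if a rule changed one of them. A compressed userspace sketch of the pattern (the packet struct and reroute() are stand-ins for the skb and ip_route_me_harder()):

#include <stdint.h>
#include <stdio.h>

struct pkt {
	uint32_t saddr, daddr, mark;
	uint8_t tos;
};

static void run_chain(struct pkt *p) { p->mark = 42; } /* a rule sets mark */
static void reroute(void) { printf("re-ran route lookup\n"); }

int main(void)
{
	struct pkt p = { .saddr = 1, .daddr = 2 };
	struct pkt before = p;		/* snapshot before the chain runs */

	run_chain(&p);

	/* only pay for a route lookup when a relevant field changed */
	if (p.saddr != before.saddr || p.daddr != before.daddr ||
	    p.mark != before.mark || p.tos != before.tos)
		reroute();
	return 0;
}
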
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 79d48c1d06f4..2605f43737bc 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -13,20 +10,22 @@
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
+#include <linux/if_arp.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_tables.h>
struct nft_cmp_expr {
struct nft_data data;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_cmp_ops op:8;
};
-static void nft_cmp_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_cmp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);
int d;
@@ -44,7 +43,7 @@ static void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_LT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_LTE:
if (d > 0)
goto mismatch;
@@ -52,7 +51,7 @@ static void nft_cmp_eval(const struct nft_expr *expr,
case NFT_CMP_GT:
if (d == 0)
goto mismatch;
- /* fall through */
+ fallthrough;
case NFT_CMP_GTE:
if (d < 0)
goto mismatch;
@@ -74,16 +73,17 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
int err;
- err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
@@ -92,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return 0;
}
-static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_cmp_expr *priv = nft_expr_priv(expr);
@@ -110,50 +111,159 @@ nla_put_failure:
return -1;
}
+union nft_cmp_offload_data {
+ u16 val16;
+ u32 val32;
+ u64 val64;
+};
+
+static void nft_payload_n2h(union nft_cmp_offload_data *data,
+ const u8 *val, u32 len)
+{
+ switch (len) {
+ case 2:
+ data->val16 = ntohs(*((__be16 *)val));
+ break;
+ case 4:
+ data->val32 = ntohl(*((__be32 *)val));
+ break;
+ case 8:
+ data->val64 = be64_to_cpu(*((__be64 *)val));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_cmp_expr *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->sreg];
+ union nft_cmp_offload_data _data, _datamask;
+ u8 *mask = (u8 *)&flow->match.mask;
+ u8 *key = (u8 *)&flow->match.key;
+ u8 *data, *datamask;
+
+ if (priv->op != NFT_CMP_EQ || priv->len > reg->len)
+ return -EOPNOTSUPP;
+
+ if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) {
+ nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len);
+ nft_payload_n2h(&_datamask, (u8 *)&reg->mask, reg->len);
+ data = (u8 *)&_data;
+ datamask = (u8 *)&_datamask;
+ } else {
+ data = (u8 *)&priv->data;
+ datamask = (u8 *)&reg->mask;
+ }
+
+ memcpy(key + reg->offset, data, reg->len);
+ memcpy(mask + reg->offset, datamask, reg->len);
+
+ flow->match.dissector.used_keys |= BIT_ULL(reg->key);
+ flow->match.dissector.offset[reg->key] = reg->base_offset;
+
+ if (reg->key == FLOW_DISSECTOR_KEY_META &&
+ reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) &&
+ nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
+ return -EOPNOTSUPP;
+
+ nft_offload_update_dependency(ctx, &priv->data, reg->len);
+
+ return 0;
+}
+
+static int nft_cmp_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp_expr *priv = nft_expr_priv(expr);
+
+ return __nft_cmp_offload(ctx, flow, priv);
+}
+
static const struct nft_expr_ops nft_cmp_ops = {
.type = &nft_cmp_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)),
.eval = nft_cmp_eval,
.init = nft_cmp_init,
.dump = nft_cmp_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp_offload,
};
+/* Calculate the mask for the nft_cmp_fast expression. On big endian the
+ * mask needs to include the *upper* bytes when interpreting that data as
+ * something smaller than the full u32; therefore a cpu_to_le32 is done.
+ */
+static u32 nft_cmp_fast_mask(unsigned int len)
+{
+ __le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr,
+ data) * BITS_PER_BYTE - len));
+
+ return (__force u32)mask;
+}
+
static int nft_cmp_fast_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
struct nft_data data;
- u32 mask;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
int err;
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return err;
- priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]);
- err = nft_validate_register_load(priv->sreg, desc.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
if (err < 0)
return err;
desc.len *= BITS_PER_BYTE;
- mask = nft_cmp_fast_mask(desc.len);
- priv->data = data.data[0] & mask;
+ priv->mask = nft_cmp_fast_mask(desc.len);
+ priv->data = data.data[0] & priv->mask;
priv->len = desc.len;
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
return 0;
}
-static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
{
const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = {
+ .data = {
+ [0] = priv->data,
+ },
+ },
+ .sreg = priv->sreg,
+ .len = priv->len / BITS_PER_BYTE,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
struct nft_data data;
if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_CMP_OP, htonl(NFT_CMP_EQ)))
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
goto nla_put_failure;
data.data[0] = priv->data;
@@ -172,14 +282,114 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
.eval = NULL, /* inlined */
.init = nft_cmp_fast_init,
.dump = nft_cmp_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp_fast_offload,
+};
+
+static u32 nft_cmp_mask(u32 bitlen)
+{
+ return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen));
+}
+
+static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen)
+{
+ int len = bitlen / BITS_PER_BYTE;
+ int i, words = len / sizeof(u32);
+
+ for (i = 0; i < words; i++) {
+ data->data[i] = 0xffffffff;
+ bitlen -= sizeof(u32) * BITS_PER_BYTE;
+ }
+
+ if (len % sizeof(u32))
+ data->data[i++] = nft_cmp_mask(bitlen);
+
+ for (; i < 4; i++)
+ data->data[i] = 0;
+}
+
+static int nft_cmp16_fast_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data),
+ };
+ int err;
+
+ err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]);
+ if (err < 0)
+ return err;
+
+ err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len);
+ if (err < 0)
+ return err;
+
+ nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE);
+ priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ;
+ priv->len = desc.len;
+
+ return 0;
+}
+
+static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ struct nft_cmp_expr cmp = {
+ .data = priv->data,
+ .sreg = priv->sreg,
+ .len = priv->len,
+ .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ,
+ };
+
+ return __nft_cmp_offload(ctx, flow, &cmp);
+}
+
+static int nft_cmp16_fast_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr);
+ enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ;
+
+ if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg))
+ goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op)))
+ goto nla_put_failure;
+
+ if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data,
+ NFT_DATA_VALUE, priv->len) < 0)
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+const struct nft_expr_ops nft_cmp16_fast_ops = {
+ .type = &nft_cmp_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)),
+ .eval = NULL, /* inlined */
+ .init = nft_cmp16_fast_init,
+ .dump = nft_cmp16_fast_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_cmp16_fast_offload,
};
static const struct nft_expr_ops *
nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
{
- struct nft_data_desc desc;
struct nft_data data;
+ struct nft_data_desc desc = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(data),
+ };
enum nft_cmp_ops op;
+ u8 sreg;
int err;
if (tb[NFTA_CMP_SREG] == NULL ||
@@ -200,23 +410,21 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
return ERR_PTR(-EINVAL);
}
- err = nft_data_init(NULL, &data, sizeof(data), &desc,
- tb[NFTA_CMP_DATA]);
+ err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]);
if (err < 0)
return ERR_PTR(err);
- if (desc.type != NFT_DATA_VALUE) {
- err = -EINVAL;
- goto err1;
- }
-
- if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
- return &nft_cmp_fast_ops;
+ sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG]));
+ if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) {
+ if (desc.len <= sizeof(u32))
+ return &nft_cmp_fast_ops;
+ else if (desc.len <= sizeof(data) &&
+ ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) ||
+ (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0)))
+ return &nft_cmp16_fast_ops;
+ }
return &nft_cmp_ops;
-err1:
- nft_data_release(&data, desc.type);
- return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
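The nft_cmp_fast_mask() helper added above deserves a worked example. Registers hold data in wire (big-endian) order, so "the first len bits" means the low bytes on a little-endian host but the high bytes on a big-endian one; applying cpu_to_le32() to the naive low-bits mask produces exactly that. A userspace check, assuming the glibc/musl htole32() from <endian.h> as a stand-in for cpu_to_le32():

#include <endian.h>	/* htole32(): glibc/musl; BSDs use <sys/endian.h> */
#include <stdint.h>
#include <stdio.h>

static uint32_t cmp_fast_mask(unsigned int len_bits)
{
	/* ~0U >> (32 - len) keeps the len low-order bits numerically;
	 * htole32() (the cpu_to_le32 stand-in) reinterprets that as
	 * "the first len bits in memory", matching wire-order registers. */
	return htole32(~0U >> (32 - len_bits));
}

int main(void)
{
	/* little-endian host: 0000ffff; big-endian host: ffff0000 --
	 * either way the first two bytes in memory are all-ones. */
	printf("%08x\n", (unsigned int)cmp_fast_mask(16));
	return 0;
}

The nft_cmp16_fast variant generalises the same construction to a 16-byte nft_data by filling whole 32-bit words with all-ones and masking only the final partial word.
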
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7334e0b80a5e..72711d62fddf 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* This software has been sponsored by Sophos Astaro <http://www.sophos.com>
*/
@@ -22,19 +19,7 @@
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_arp/arp_tables.h>
#include <net/netfilter/nf_tables.h>
-
-struct nft_xt {
- struct list_head head;
- struct nft_expr_ops ops;
- unsigned int refcnt;
-
- /* Unlike other expressions, ops doesn't have static storage duration.
- * nft core assumes they do. We use kfree_rcu so that nft core can
- * can check expr->ops->size even after nft_compat->destroy() frees
- * the nft_xt struct that holds the ops structure.
- */
- struct rcu_head rcu_head;
-};
+#include <net/netfilter/nf_log.h>
/* Used for matches where *info is larger than X byte */
#define NFT_MATCH_LARGE_THRESH 192
@@ -43,17 +28,6 @@ struct nft_xt_match_priv {
void *info;
};
-static bool nft_xt_put(struct nft_xt *xt)
-{
- if (--xt->refcnt == 0) {
- list_del(&xt->head);
- kfree_rcu(xt, rcu_head);
- return true;
- }
-
- return false;
-}
-
static int nft_compat_chain_validate_dependency(const struct nft_ctx *ctx,
const char *tablename)
{
@@ -84,8 +58,13 @@ union nft_entry {
};
static inline void
-nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+nft_compat_set_par(struct xt_action_param *par,
+ const struct nft_pktinfo *pkt,
+ const void *xt, const void *xt_info)
{
+ par->state = pkt->state;
+ par->thoff = nft_thoff(pkt);
+ par->fragoff = pkt->fragoff;
par->target = xt;
par->targinfo = xt_info;
par->hotdrop = false;
@@ -98,13 +77,14 @@ static void nft_target_eval_xt(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -124,13 +104,14 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
void *info = nft_expr_priv(expr);
struct xt_target *target = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
int ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+ nft_compat_set_par(&xt, pkt, target, info);
- ret = target->target(skb, &pkt->xt);
+ ret = target->target(skb, &xt);
- if (pkt->xt.hotdrop)
+ if (xt.hotdrop)
ret = NF_DROP;
switch (ret) {
@@ -154,7 +135,7 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
[NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_TARGET_REV] = { .type = NLA_U32 },
+ [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TARGET_INFO] = { .type = NLA_BINARY },
};
@@ -219,11 +200,12 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1]
static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
{
struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+ u32 l4proto;
u32 flags;
int err;
- err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
- nft_rule_compat_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_RULE_COMPAT_MAX, attr,
+ nft_rule_compat_policy, NULL);
if (err < 0)
return err;
@@ -231,15 +213,32 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return -EINVAL;
flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
- if (flags & ~NFT_RULE_COMPAT_F_MASK)
+ if (flags & NFT_RULE_COMPAT_F_UNUSED ||
+ flags & ~NFT_RULE_COMPAT_F_MASK)
return -EINVAL;
if (flags & NFT_RULE_COMPAT_F_INV)
*inv = true;
- *proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ l4proto = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+ if (l4proto > U16_MAX)
+ return -EINVAL;
+
+ *proto = l4proto;
+
return 0;
}
+static void nft_compat_wait_for_destructors(struct net *net)
+{
+ /* xtables matches or targets can have side effects, e.g.
+ * creation/destruction of /proc files.
+ * The xt ->destroy functions are run asynchronously from a
+ * work queue. If we have pending invocations, we thus
+ * need to wait for those to finish.
+ */
+ nf_tables_trans_destroy_flush_work(net);
+}
+
static int
nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -248,7 +247,6 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_target *target = expr->ops->data;
struct xt_tgchk_param par;
size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
- struct nft_xt *nft_xt;
u16 proto = 0;
bool inv = false;
union nft_entry e = {};
@@ -264,24 +262,45 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+ nft_compat_wait_for_destructors(ctx->net);
+
ret = xt_check_target(&par, size, proto, inv);
- if (ret < 0)
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ const char *modname = NULL;
+
+ if (strcmp(target->name, "LOG") == 0)
+ modname = "nf_log_syslog";
+ else if (strcmp(target->name, "NFLOG") == 0)
+ modname = "nfnetlink_log";
+
+ if (modname &&
+ nft_request_module(ctx->net, "%s", modname) == -EAGAIN)
+ return -EAGAIN;
+ }
+
return ret;
+ }
/* The standard target cannot be used */
if (!target->target)
return -EINVAL;
- nft_xt = container_of(expr->ops, struct nft_xt, ops);
- nft_xt->refcnt++;
return 0;
}
+static void __nft_mt_tg_destroy(struct module *me, const struct nft_expr *expr)
+{
+ module_put(me);
+ kfree(expr->ops);
+}
+
static void
nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
+ struct module *me = target->me;
struct xt_tgdtor_param par;
par.net = ctx->net;
@@ -291,8 +310,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
if (par.target->destroy != NULL)
par.target->destroy(&par);
- if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
- module_put(target->me);
+ __nft_mt_tg_destroy(me, expr);
}
static int nft_extension_dump_info(struct sk_buff *skb, int attr,
@@ -313,7 +331,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr,
return 0;
}
-static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_target_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct xt_target *target = expr->ops->data;
void *info = nft_expr_priv(expr);
@@ -331,13 +350,28 @@ nla_put_failure:
}
static int nft_target_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_target *target = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -361,13 +395,14 @@ static void __nft_match_eval(const struct nft_expr *expr,
{
struct xt_match *match = expr->ops->data;
struct sk_buff *skb = pkt->skb;
+ struct xt_action_param xt;
bool ret;
- nft_compat_set_par((struct xt_action_param *)&pkt->xt, match, info);
+ nft_compat_set_par(&xt, pkt, match, info);
- ret = match->match(skb, (struct xt_action_param *)&pkt->xt);
+ ret = match->match(skb, &xt);
- if (pkt->xt.hotdrop) {
+ if (xt.hotdrop) {
regs->verdict.code = NF_DROP;
return;
}
@@ -400,7 +435,7 @@ static void nft_match_eval(const struct nft_expr *expr,
static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
[NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
- [NFTA_MATCH_REV] = { .type = NLA_U32 },
+ [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_MATCH_INFO] = { .type = NLA_BINARY },
};
@@ -465,7 +500,6 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_match *match = expr->ops->data;
struct xt_mtchk_param par;
size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
- struct nft_xt *nft_xt;
u16 proto = 0;
bool inv = false;
union nft_entry e = {};
@@ -481,13 +515,9 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
- ret = xt_check_match(&par, size, proto, inv);
- if (ret < 0)
- return ret;
+ nft_compat_wait_for_destructors(ctx->net);
- nft_xt = container_of(expr->ops, struct nft_xt, ops);
- nft_xt->refcnt++;
- return 0;
+ return xt_check_match(&par, size, proto, inv);
}
static int
@@ -505,7 +535,7 @@ nft_match_large_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
struct xt_match *m = expr->ops->data;
int ret;
- priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL);
+ priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT);
if (!priv->info)
return -ENOMEM;
@@ -530,8 +560,7 @@ __nft_match_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr,
if (par.match->destroy != NULL)
par.match->destroy(&par);
- if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
- module_put(me);
+ __nft_mt_tg_destroy(me, expr);
}
static void
@@ -566,12 +595,14 @@ nla_put_failure:
return -1;
}
-static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_match_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
return __nft_match_dump(skb, expr, nft_expr_priv(expr));
}
-static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
+static int nft_match_large_dump(struct sk_buff *skb,
+ const struct nft_expr *e, bool reset)
{
struct nft_xt_match_priv *priv = nft_expr_priv(e);
@@ -579,13 +610,28 @@ static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e)
}
static int nft_match_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct xt_match *match = expr->ops->data;
unsigned int hook_mask = 0;
int ret;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
+ ret = nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+ if (ret)
+ return ret;
+
if (nft_is_base_chain(ctx->chain)) {
const struct nft_base_chain *basechain =
nft_base_chain(ctx->chain);
@@ -608,19 +654,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
int rev, int target)
{
struct nlmsghdr *nlh;
- struct nfgenmsg *nfmsg;
unsigned int flags = portid ? NLM_F_MULTI : 0;
event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event);
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
- if (nlh == NULL)
+ nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
+ NFNETLINK_V0, 0);
+ if (!nlh)
goto nlmsg_failure;
- nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = family;
- nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = 0;
-
if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
@@ -635,17 +676,15 @@ nla_put_failure:
return -1;
}
-static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const tb[],
- struct netlink_ext_ack *extack)
+static int nfnl_compat_get_rcu(struct sk_buff *skb,
+ const struct nfnl_info *info,
+ const struct nlattr * const tb[])
{
+ u8 family = info->nfmsg->nfgen_family;
+ const char *name, *fmt;
+ struct sk_buff *skb2;
int ret = 0, target;
- struct nfgenmsg *nfmsg;
- const char *fmt;
- const char *name;
u32 rev;
- struct sk_buff *skb2;
if (tb[NFTA_COMPAT_NAME] == NULL ||
tb[NFTA_COMPAT_REV] == NULL ||
@@ -656,9 +695,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
- nfmsg = nlmsg_data(nlh);
-
- switch(nfmsg->nfgen_family) {
+ switch(family) {
case AF_INET:
fmt = "ipt_%s";
break;
@@ -672,8 +709,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
fmt = "arpt_%s";
break;
default:
- pr_err("nft_compat: unsupported protocol %d\n",
- nfmsg->nfgen_family);
+ pr_err("nft_compat: unsupported protocol %d\n", family);
return -EINVAL;
}
@@ -681,9 +717,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
return -EINVAL;
rcu_read_unlock();
- try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
- rev, target, &ret),
- fmt, name);
+ try_then_request_module(xt_find_revision(family, name, rev, target, &ret),
+ fmt, name);
if (ret < 0)
goto out_put;
@@ -695,36 +730,36 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl,
/* include the best revision for this extension in the message */
if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq,
- NFNL_MSG_TYPE(nlh->nlmsg_type),
+ info->nlh->nlmsg_seq,
+ NFNL_MSG_TYPE(info->nlh->nlmsg_type),
NFNL_MSG_COMPAT_GET,
- nfmsg->nfgen_family,
- name, ret, target) <= 0) {
+ family, name, ret, target) <= 0) {
kfree_skb(skb2);
goto out_put;
}
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
- MSG_DONTWAIT);
- if (ret > 0)
- ret = 0;
+ ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
out_put:
rcu_read_lock();
module_put(THIS_MODULE);
- return ret == -EAGAIN ? -ENOBUFS : ret;
+
+ return ret;
}
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
[NFTA_COMPAT_NAME] = { .type = NLA_NUL_STRING,
.len = NFT_COMPAT_NAME_MAX-1 },
- [NFTA_COMPAT_REV] = { .type = NLA_U32 },
+ [NFTA_COMPAT_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_COMPAT_TYPE] = { .type = NLA_U32 },
};
static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = {
- [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu,
- .attr_count = NFTA_COMPAT_MAX,
- .policy = nfnl_compat_policy_get },
+ [NFNL_MSG_COMPAT_GET] = {
+ .call = nfnl_compat_get_rcu,
+ .type = NFNL_CB_RCU,
+ .attr_count = NFTA_COMPAT_MAX,
+ .policy = nfnl_compat_policy_get
+ },
};
static const struct nfnetlink_subsystem nfnl_compat_subsys = {
@@ -734,22 +769,21 @@ static const struct nfnetlink_subsystem nfnl_compat_subsys = {
.cb = nfnl_nft_compat_cb,
};
-static LIST_HEAD(nft_match_list);
-
static struct nft_expr_type nft_match_type;
-static bool nft_match_cmp(const struct xt_match *match,
- const char *name, u32 rev, u32 family)
+static bool nft_match_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
{
- return strcmp(match->name, name) == 0 && match->revision == rev &&
- (match->family == NFPROTO_UNSPEC || match->family == family);
+ const struct xt_match *match = expr->ops->data;
+
+ return strcmp(match->name, "comment") == 0;
}
static const struct nft_expr_ops *
nft_match_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
- struct nft_xt *nft_match;
+ struct nft_expr_ops *ops;
struct xt_match *match;
unsigned int matchsize;
char *mt_name;
@@ -765,14 +799,6 @@ nft_match_select_ops(const struct nft_ctx *ctx,
rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
family = ctx->family;
- /* Re-use the existing match if it's already loaded. */
- list_for_each_entry(nft_match, &nft_match_list, head) {
- struct xt_match *match = nft_match->ops.data;
-
- if (nft_match_cmp(match, mt_name, rev, family))
- return &nft_match->ops;
- }
-
match = xt_request_find_match(family, mt_name, rev);
if (IS_ERR(match))
return ERR_PTR(-ENOENT);
@@ -782,66 +808,63 @@ nft_match_select_ops(const struct nft_ctx *ctx,
goto err;
}
- /* This is the first time we use this match, allocate operations */
- nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
- if (nft_match == NULL) {
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
err = -ENOMEM;
goto err;
}
- nft_match->refcnt = 0;
- nft_match->ops.type = &nft_match_type;
- nft_match->ops.eval = nft_match_eval;
- nft_match->ops.init = nft_match_init;
- nft_match->ops.destroy = nft_match_destroy;
- nft_match->ops.dump = nft_match_dump;
- nft_match->ops.validate = nft_match_validate;
- nft_match->ops.data = match;
+ ops->type = &nft_match_type;
+ ops->eval = nft_match_eval;
+ ops->init = nft_match_init;
+ ops->destroy = nft_match_destroy;
+ ops->dump = nft_match_dump;
+ ops->validate = nft_match_validate;
+ ops->data = match;
+ ops->reduce = nft_match_reduce;
matchsize = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize));
if (matchsize > NFT_MATCH_LARGE_THRESH) {
matchsize = NFT_EXPR_SIZE(sizeof(struct nft_xt_match_priv));
- nft_match->ops.eval = nft_match_large_eval;
- nft_match->ops.init = nft_match_large_init;
- nft_match->ops.destroy = nft_match_large_destroy;
- nft_match->ops.dump = nft_match_large_dump;
+ ops->eval = nft_match_large_eval;
+ ops->init = nft_match_large_init;
+ ops->destroy = nft_match_large_destroy;
+ ops->dump = nft_match_large_dump;
}
- nft_match->ops.size = matchsize;
-
- list_add(&nft_match->head, &nft_match_list);
+ ops->size = matchsize;
- return &nft_match->ops;
+ return ops;
err:
module_put(match->me);
return ERR_PTR(err);
}
+static void nft_match_release_ops(const struct nft_expr_ops *ops)
+{
+ struct xt_match *match = ops->data;
+
+ module_put(match->me);
+ kfree(ops);
+}
+
static struct nft_expr_type nft_match_type __read_mostly = {
.name = "match",
.select_ops = nft_match_select_ops,
+ .release_ops = nft_match_release_ops,
.policy = nft_match_policy,
.maxattr = NFTA_MATCH_MAX,
.owner = THIS_MODULE,
};
-static LIST_HEAD(nft_target_list);
-
static struct nft_expr_type nft_target_type;
-static bool nft_target_cmp(const struct xt_target *tg,
- const char *name, u32 rev, u32 family)
-{
- return strcmp(tg->name, name) == 0 && tg->revision == rev &&
- (tg->family == NFPROTO_UNSPEC || tg->family == family);
-}
-
static const struct nft_expr_ops *
nft_target_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
- struct nft_xt *nft_target;
+ struct nft_expr_ops *ops;
struct xt_target *target;
char *tg_name;
u32 rev, family;
@@ -861,17 +884,6 @@ nft_target_select_ops(const struct nft_ctx *ctx,
strcmp(tg_name, "standard") == 0)
return ERR_PTR(-EINVAL);
- /* Re-use the existing target if it's already loaded. */
- list_for_each_entry(nft_target, &nft_target_list, head) {
- struct xt_target *target = nft_target->ops.data;
-
- if (!target->target)
- continue;
-
- if (nft_target_cmp(target, tg_name, rev, family))
- return &nft_target->ops;
- }
-
target = xt_request_find_target(family, tg_name, rev);
if (IS_ERR(target))
return ERR_PTR(-ENOENT);
@@ -886,38 +898,44 @@ nft_target_select_ops(const struct nft_ctx *ctx,
goto err;
}
- /* This is the first time we use this target, allocate operations */
- nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
- if (nft_target == NULL) {
+ ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT);
+ if (!ops) {
err = -ENOMEM;
goto err;
}
- nft_target->refcnt = 0;
- nft_target->ops.type = &nft_target_type;
- nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
- nft_target->ops.init = nft_target_init;
- nft_target->ops.destroy = nft_target_destroy;
- nft_target->ops.dump = nft_target_dump;
- nft_target->ops.validate = nft_target_validate;
- nft_target->ops.data = target;
+ ops->type = &nft_target_type;
+ ops->size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize));
+ ops->init = nft_target_init;
+ ops->destroy = nft_target_destroy;
+ ops->dump = nft_target_dump;
+ ops->validate = nft_target_validate;
+ ops->data = target;
+ ops->reduce = NFT_REDUCE_READONLY;
if (family == NFPROTO_BRIDGE)
- nft_target->ops.eval = nft_target_eval_bridge;
+ ops->eval = nft_target_eval_bridge;
else
- nft_target->ops.eval = nft_target_eval_xt;
-
- list_add(&nft_target->head, &nft_target_list);
+ ops->eval = nft_target_eval_xt;
- return &nft_target->ops;
+ return ops;
err:
module_put(target->me);
return ERR_PTR(err);
}
+static void nft_target_release_ops(const struct nft_expr_ops *ops)
+{
+ struct xt_target *target = ops->data;
+
+ module_put(target->me);
+ kfree(ops);
+}
+
static struct nft_expr_type nft_target_type __read_mostly = {
.name = "target",
.select_ops = nft_target_select_ops,
+ .release_ops = nft_target_release_ops,
.policy = nft_target_policy,
.maxattr = NFTA_TARGET_MAX,
.owner = THIS_MODULE,
@@ -942,7 +960,6 @@ static int __init nft_compat_module_init(void)
}
return ret;
-
err_target:
nft_unregister_expr(&nft_target_type);
err_match:
@@ -952,32 +969,6 @@ err_match:
static void __exit nft_compat_module_exit(void)
{
- struct nft_xt *xt, *next;
-
- /* list should be empty here, it can be non-empty only in case there
- * was an error that caused nft_xt expr to not be initialized fully
- * and noone else requested the same expression later.
- *
- * In this case, the lists contain 0-refcount entries that still
- * hold module reference.
- */
- list_for_each_entry_safe(xt, next, &nft_target_list, head) {
- struct xt_target *target = xt->ops.data;
-
- if (WARN_ON_ONCE(xt->refcnt))
- continue;
- module_put(target->me);
- kfree(xt);
- }
-
- list_for_each_entry_safe(xt, next, &nft_match_list, head) {
- struct xt_match *match = xt->ops.data;
-
- if (WARN_ON_ONCE(xt->refcnt))
- continue;
- module_put(match->me);
- kfree(xt);
- }
nfnetlink_subsys_unregister(&nfnl_compat_subsys);
nft_unregister_expr(&nft_target_type);
nft_unregister_expr(&nft_match_type);
@@ -992,3 +983,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("match");
MODULE_ALIAS_NFT_EXPR("target");
+MODULE_DESCRIPTION("x_tables over nftables support");
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index af1497ab9464..657764774a2d 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,7 +14,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- struct nf_conncount_list list;
+ struct nf_conncount_list *list;
u32 limit;
bool invert;
};
@@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
const struct nft_pktinfo *pkt,
const struct nft_set_ext *ext)
{
- const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
- const struct nf_conntrack_tuple *tuple_ptr;
- struct nf_conntrack_tuple tuple;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn *ct;
unsigned int count;
+ int err;
- tuple_ptr = &tuple;
-
- ct = nf_ct_get(pkt->skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb),
- nft_pf(pkt), nft_net(pkt), &tuple)) {
- regs->verdict.code = NF_DROP;
- return;
- }
-
- if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) {
- regs->verdict.code = NF_DROP;
- return;
+ err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list);
+ if (err) {
+ if (err == -EEXIST) {
+ /* Call gc to update the list count if any connection has
+ * been closed already. This is useful for softlimit
+ * connections like limiting bandwidth based on a number
+ * of open connections.
+ */
+ nf_conncount_gc_list(nft_net(pkt), priv->list);
+ } else {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
}
- count = priv->list.count;
+ count = READ_ONCE(priv->list->count);
- if ((count > priv->limit) ^ priv->invert) {
+ if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -62,6 +56,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
{
bool invert = false;
u32 flags, limit;
+ int err;
if (!tb[NFTA_CONNLIMIT_COUNT])
return -EINVAL;
@@ -76,18 +71,31 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- nf_conncount_list_init(&priv->list);
+ priv->list = kmalloc(sizeof(*priv->list), GFP_KERNEL_ACCOUNT);
+ if (!priv->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv->list);
priv->limit = limit;
priv->invert = invert;
- return nf_ct_netns_get(ctx->net, ctx->family);
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err < 0)
+ goto err_netns;
+
+ return 0;
+err_netns:
+ kfree(priv->list);
+
+ return err;
}
static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -123,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx,
return nft_connlimit_do_init(ctx, tb, priv);
}
+static void nft_connlimit_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_connlimit *newpriv = nft_obj_data(newobj);
+ struct nft_connlimit *priv = nft_obj_data(obj);
+
+ WRITE_ONCE(priv->limit, newpriv->limit);
+ WRITE_ONCE(priv->invert, newpriv->invert);
+}
+
static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx,
struct nft_object *obj)
{
@@ -152,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = {
.init = nft_connlimit_obj_init,
.destroy = nft_connlimit_obj_destroy,
.dump = nft_connlimit_obj_dump,
+ .update = nft_connlimit_obj_update,
};
static struct nft_object_type nft_connlimit_obj_type __read_mostly = {
@@ -171,7 +190,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr,
nft_connlimit_do_eval(priv, regs, pkt, NULL);
}
-static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_connlimit_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
@@ -195,12 +215,16 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx,
nft_connlimit_do_destroy(ctx, priv);
}
-static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- nf_conncount_list_init(&priv_dst->list);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp);
+ if (!priv_dst->list)
+ return -ENOMEM;
+
+ nf_conncount_list_init(priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -212,14 +236,15 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->list);
+ nf_conncount_cache_free(priv->list);
+ kfree(priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- return nf_conncount_gc_list(net, &priv->list);
+ return nf_conncount_gc_list(net, priv->list);
}
static struct nft_expr_type nft_connlimit_type;
@@ -233,6 +258,7 @@ static const struct nft_expr_ops nft_connlimit_ops = {
.destroy_clone = nft_connlimit_destroy_clone,
.dump = nft_connlimit_dump,
.gc = nft_connlimit_gc,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_connlimit_type __read_mostly = {
@@ -275,3 +301,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso");
MODULE_ALIAS_NFT_EXPR("connlimit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CONNLIMIT);
+MODULE_DESCRIPTION("nftables connlimit rule support");
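The init/destroy changes above follow the usual acquire-in-order, release-in-reverse unwinding discipline: each resource acquired after the allocation gets a goto label that releases everything acquired before it. A minimal sketch of the shape, with get_netns_ref()/put_netns_ref() as stand-ins for nf_ct_netns_get()/nf_ct_netns_put():

#include <errno.h>
#include <stdlib.h>

struct list { int count; };
struct priv { struct list *list; };

static int get_netns_ref(void) { return 0; }  /* assumption: may fail */
static void put_netns_ref(void) { }

static int do_init(struct priv *priv)
{
	int err;

	priv->list = calloc(1, sizeof(*priv->list));
	if (!priv->list)
		return -ENOMEM;

	err = get_netns_ref();
	if (err < 0)
		goto err_netns;

	return 0;

err_netns:
	free(priv->list);
	return err;
}

static void do_destroy(struct priv *priv)
{
	put_netns_ref();
	free(priv->list);	/* reverse order of do_init() */
}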
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index a61d7edfc290..cc7325329496 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -1,23 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
+ u64_stats_t bytes;
+ u64_stats_t packets;
+};
+
+struct nft_counter_tot {
s64 bytes;
s64 packets;
};
@@ -26,25 +30,24 @@ struct nft_counter_percpu_priv {
struct nft_counter __percpu *counter;
};
-static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync);
static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- write_seqcount_begin(myseq);
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->bytes, pkt->skb->len);
+ u64_stats_inc(&this_cpu->packets);
+ u64_stats_update_end(nft_sync);
- this_cpu->bytes += pkt->skb->len;
- this_cpu->packets++;
-
- write_seqcount_end(myseq);
local_bh_enable();
}
@@ -63,21 +66,20 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- cpu_stats = alloc_percpu(struct nft_counter);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_KERNEL_ACCOUNT);
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu = raw_cpu_ptr(cpu_stats);
if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ u64_stats_set(&this_cpu->packets,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])));
}
if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ u64_stats_set(&this_cpu->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])));
}
- preempt_enable();
+
priv->counter = cpu_stats;
return 0;
}
@@ -104,36 +106,42 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
nft_counter_do_destroy(priv);
}
-static void nft_counter_reset(struct nft_counter_percpu_priv __percpu *priv,
- struct nft_counter *total)
+static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
+ struct nft_counter_tot *total)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- this_cpu->packets -= total->packets;
- this_cpu->bytes -= total->bytes;
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, -total->packets);
+ u64_stats_add(&this_cpu->bytes, -total->bytes);
+ u64_stats_update_end(nft_sync);
+
local_bh_enable();
}
static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
struct nft_counter *this_cpu;
- const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu);
+
this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = read_seqcount_begin(myseq);
- bytes = this_cpu->bytes;
- packets = this_cpu->packets;
- } while (read_seqcount_retry(myseq, seq));
+ seq = u64_stats_fetch_begin(nft_sync);
+ bytes = u64_stats_read(&this_cpu->bytes);
+ packets = u64_stats_read(&this_cpu->packets);
+ } while (u64_stats_fetch_retry(nft_sync, seq));
total->bytes += bytes;
total->packets += packets;
@@ -144,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb,
struct nft_counter_percpu_priv *priv,
bool reset)
{
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
@@ -176,7 +184,7 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
[NFTA_COUNTER_BYTES] = { .type = NLA_U64 },
};
-static struct nft_object_type nft_counter_obj_type;
+struct nft_object_type nft_counter_obj_type;
static const struct nft_object_ops nft_counter_obj_ops = {
.type = &nft_counter_obj_type,
.size = sizeof(struct nft_counter_percpu_priv),
@@ -186,7 +194,7 @@ static const struct nft_object_ops nft_counter_obj_ops = {
.dump = nft_counter_obj_dump,
};
-static struct nft_object_type nft_counter_obj_type __read_mostly = {
+struct nft_object_type nft_counter_obj_type __read_mostly = {
.type = NFT_OBJECT_COUNTER,
.ops = &nft_counter_obj_ops,
.maxattr = NFTA_COUNTER_MAX,
@@ -194,20 +202,20 @@ static struct nft_object_type nft_counter_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
-static void nft_counter_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
nft_counter_do_eval(priv, regs, pkt);
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_counter_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- return nft_counter_do_dump(skb, priv, false);
+ return nft_counter_do_dump(skb, priv, reset);
}
static int nft_counter_init(const struct nft_ctx *ctx,
@@ -227,31 +235,63 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
nft_counter_do_destroy(priv);
}
-static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
- cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp);
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->packets = total.packets;
- this_cpu->bytes = total.bytes;
- preempt_enable();
+ this_cpu = raw_cpu_ptr(cpu_stats);
+ u64_stats_set(&this_cpu->packets, total.packets);
+ u64_stats_set(&this_cpu->bytes, total.bytes);
priv_clone->counter = cpu_stats;
return 0;
}
-static struct nft_expr_type nft_counter_type;
+static int nft_counter_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ /* No specific offload action is needed, but report success. */
+ return 0;
+}
+
+static void nft_counter_offload_stats(struct nft_expr *expr,
+ const struct flow_stats *stats)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct u64_stats_sync *nft_sync;
+ struct nft_counter *this_cpu;
+
+ local_bh_disable();
+ this_cpu = this_cpu_ptr(priv->counter);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, stats->pkts);
+ u64_stats_add(&this_cpu->bytes, stats->bytes);
+ u64_stats_update_end(nft_sync);
+ local_bh_enable();
+}
+
+void nft_counter_init_seqcount(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu));
+}
+
+struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)),
@@ -261,9 +301,12 @@ static const struct nft_expr_ops nft_counter_ops = {
.destroy_clone = nft_counter_destroy,
.dump = nft_counter_dump,
.clone = nft_counter_clone,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_counter_offload,
+ .offload_stats = nft_counter_offload_stats,
};
-static struct nft_expr_type nft_counter_type __read_mostly = {
+struct nft_expr_type nft_counter_type __read_mostly = {
.name = "counter",
.ops = &nft_counter_ops,
.policy = nft_counter_policy,
@@ -271,38 +314,3 @@ static struct nft_expr_type nft_counter_type __read_mostly = {
.flags = NFT_EXPR_STATEFUL,
.owner = THIS_MODULE,
};
-
-static int __init nft_counter_module_init(void)
-{
- int cpu, err;
-
- for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
-
- err = nft_register_obj(&nft_counter_obj_type);
- if (err < 0)
- return err;
-
- err = nft_register_expr(&nft_counter_type);
- if (err < 0)
- goto err1;
-
- return 0;
-err1:
- nft_unregister_obj(&nft_counter_obj_type);
- return err;
-}
-
-static void __exit nft_counter_module_exit(void)
-{
- nft_unregister_expr(&nft_counter_type);
- nft_unregister_obj(&nft_counter_obj_type);
-}
-
-module_init(nft_counter_module_init);
-module_exit(nft_counter_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_EXPR("counter");
-MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_COUNTER);
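The fetch loop above relies on the u64_stats_sync retry protocol: each per-CPU slot pairs its counters with a sequence number, and readers retry while a writer is mid-update or the sequence moved underneath them. A simplified userspace model of that read side (this emulates the 32-bit behaviour of u64_stats_fetch_begin()/_retry(); it is not the kernel API):

#include <stdatomic.h>
#include <stdint.h>

struct cpu_counter {
	_Atomic unsigned int seq;  /* odd while a write is in progress */
	uint64_t packets;
	uint64_t bytes;
};

static void fetch_total(struct cpu_counter *percpu, int ncpu,
			uint64_t *packets, uint64_t *bytes)
{
	*packets = *bytes = 0;

	for (int cpu = 0; cpu < ncpu; cpu++) {
		struct cpu_counter *c = &percpu[cpu];
		unsigned int seq;
		uint64_t p, b;

		do {
			while ((seq = atomic_load(&c->seq)) & 1)
				;                       /* writer active */
			p = c->packets;
			b = c->bytes;
		} while (atomic_load(&c->seq) != seq);  /* retry if it moved */

		*packets += p;
		*bytes += b;
	}
}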
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 586627c361df..6f2ae7cad731 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -15,7 +12,7 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_tuple.h>
@@ -24,15 +21,8 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
-
-struct nft_ct {
- enum nft_ct_keys key:8;
- enum ip_conntrack_dir dir:8;
- union {
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
- };
-};
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
struct nft_ct_helper_obj {
struct nf_conntrack_helper *helper4;
@@ -43,6 +33,7 @@ struct nft_ct_helper_obj {
#ifdef CONFIG_NF_CONNTRACK_ZONES
static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
+static DEFINE_MUTEX(nft_ct_pcpu_mutex);
#endif
static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
@@ -98,7 +89,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- *dest = ct->mark;
+ *dest = READ_ONCE(ct->mark);
return;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
@@ -118,7 +109,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
helper = rcu_dereference(help->helper);
if (helper == NULL)
goto err;
- strncpy((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
+ strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN);
return;
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS: {
@@ -131,7 +122,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
}
#endif
- case NFT_CT_BYTES: /* fallthrough */
+ case NFT_CT_BYTES:
case NFT_CT_PKTS: {
const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
u64 count = 0;
@@ -178,6 +169,9 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
}
#endif
+ case NFT_CT_ID:
+ *dest = nf_ct_get_id(ct);
+ return;
default:
break;
}
@@ -201,12 +195,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
case NFT_CT_SRC_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->src.u3.ip;
+ *dest = (__force __u32)tuple->src.u3.ip;
return;
case NFT_CT_DST_IP:
if (nf_ct_l3num(ct) != NFPROTO_IPV4)
goto err;
- *dest = tuple->dst.u3.ip;
+ *dest = (__force __u32)tuple->dst.u3.ip;
return;
case NFT_CT_SRC_IP6:
if (nf_ct_l3num(ct) != NFPROTO_IPV6)
@@ -237,6 +231,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
enum ip_conntrack_info ctinfo;
u16 value = nft_reg_load16(&regs->data[priv->sreg]);
struct nf_conn *ct;
+ int oldcnt;
ct = nf_ct_get(skb, &ctinfo);
if (ct) /* already tracked */
@@ -257,18 +252,22 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(atomic_read(&ct->ct_general.use) == 1)) {
+ __refcount_inc(&ct->ct_general.use, &oldcnt);
+ if (likely(oldcnt == 1)) {
nf_ct_zone_add(ct, &zone);
} else {
- /* previous skb got queued to userspace */
+ refcount_dec(&ct->ct_general.use);
+		/* previous skb got queued to userspace, allocate a
+		 * temporary template until the percpu one can be reused.
+		 */
ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC);
if (!ct) {
regs->verdict.code = NF_DROP;
return;
}
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
}
- atomic_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
}
#endif
@@ -292,8 +291,8 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_MARK
case NFT_CT_MARK:
- if (ct->mark != value) {
- ct->mark = value;
+ if (READ_ONCE(ct->mark) != value) {
+ WRITE_ONCE(ct->mark, value);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -337,7 +336,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = {
[NFTA_CT_DREG] = { .type = NLA_U32 },
- [NFTA_CT_KEY] = { .type = NLA_U32 },
+ [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_CT_DIRECTION] = { .type = NLA_U8 },
[NFTA_CT_SREG] = { .type = NLA_U32 },
};
@@ -373,7 +372,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
return false;
}
- atomic_set(&tmp->ct_general.use, 1);
+ __set_bit(IPS_CONFIRMED_BIT, &tmp->status);
per_cpu(nft_ct_pcpu_template, cpu) = tmp;
}
@@ -381,6 +380,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void)
}
#endif
+static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+ if (priv->key == NFT_CT_LABELS)
+ nf_connlabels_put(ctx->net);
+#endif
+}
+
static int nft_ct_get_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -415,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DIRECTION] != NULL)
return -EINVAL;
len = NF_CT_LABELS_MAX_SIZE;
+
+ err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
+ if (err)
+ return err;
break;
#endif
case NFT_CT_HELPER:
@@ -437,12 +448,12 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
switch (ctx->family) {
case NFPROTO_IPV4:
- len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ len = sizeof_field(struct nf_conntrack_tuple,
src.u3.ip);
break;
case NFPROTO_IPV6:
case NFPROTO_INET:
- len = FIELD_SIZEOF(struct nf_conntrack_tuple,
+ len = sizeof_field(struct nf_conntrack_tuple,
src.u3.ip6);
break;
default:
@@ -454,20 +465,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip);
break;
case NFT_CT_SRC_IP6:
case NFT_CT_DST_IP6:
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6);
break;
case NFT_CT_PROTO_SRC:
case NFT_CT_PROTO_DST:
if (tb[NFTA_CT_DIRECTION] == NULL)
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all);
+ len = sizeof_field(struct nf_conntrack_tuple, src.u.all);
break;
case NFT_CT_BYTES:
case NFT_CT_PKTS:
@@ -479,6 +490,12 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
len = sizeof(u16);
break;
#endif
+ case NFT_CT_ID:
+ if (tb[NFTA_CT_DIRECTION])
+ return -EINVAL;
+
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
@@ -490,19 +507,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
case IP_CT_DIR_REPLY:
break;
default:
- return -EINVAL;
+ err = -EINVAL;
+ goto err;
}
}
- priv->dreg = nft_parse_register(tb[NFTA_CT_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
if (err < 0)
- return err;
+ goto err;
err = nf_ct_netns_get(ctx->net, ctx->family);
if (err < 0)
- return err;
+ goto err;
if (priv->key == NFT_CT_BYTES ||
priv->key == NFT_CT_PKTS ||
@@ -510,6 +528,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
nf_ct_set_acct(ctx->net, true);
return 0;
+err:
+ __nft_ct_get_destroy(ctx, priv);
+ return err;
}
static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
@@ -522,8 +543,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
+ mutex_lock(&nft_ct_pcpu_mutex);
if (--nft_ct_pcpu_template_refcnt == 0)
nft_ct_tmpl_put_pcpu();
+ mutex_unlock(&nft_ct_pcpu_mutex);
+ break;
#endif
default:
break;
@@ -545,7 +569,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
case NFT_CT_MARK:
if (tb[NFTA_CT_DIRECTION])
return -EINVAL;
- len = FIELD_SIZEOF(struct nf_conn, mark);
+ len = sizeof_field(struct nf_conn, mark);
break;
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
@@ -560,9 +584,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
- if (!nft_ct_tmpl_alloc_pcpu())
+ mutex_lock(&nft_ct_pcpu_mutex);
+ if (!nft_ct_tmpl_alloc_pcpu()) {
+ mutex_unlock(&nft_ct_pcpu_mutex);
return -ENOMEM;
+ }
nft_ct_pcpu_template_refcnt++;
+ mutex_unlock(&nft_ct_pcpu_mutex);
len = sizeof(u16);
break;
#endif
@@ -596,8 +624,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
}
}
- priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len);
if (err < 0)
goto err1;
@@ -615,6 +643,9 @@ err1:
static void nft_ct_get_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
+ struct nft_ct *priv = nft_expr_priv(expr);
+
+ __nft_ct_get_destroy(ctx, priv);
nf_ct_netns_put(ctx->net, ctx->family);
}
@@ -627,7 +658,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx,
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ct_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -666,7 +698,31 @@ nla_put_failure:
return -1;
}
-static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static bool nft_ct_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ const struct nft_ct *ct;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ ct = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != ct->key) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static int nft_ct_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ct *priv = nft_expr_priv(expr);
@@ -699,8 +755,39 @@ static const struct nft_expr_ops nft_ct_get_ops = {
.init = nft_ct_get_init,
.destroy = nft_ct_get_destroy,
.dump = nft_ct_get_dump,
+ .reduce = nft_ct_get_reduce,
};
+static bool nft_ct_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_ct_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static const struct nft_expr_ops nft_ct_get_fast_ops = {
+ .type = &nft_ct_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
+ .eval = nft_ct_get_fast_eval,
+ .init = nft_ct_get_init,
+ .destroy = nft_ct_get_destroy,
+ .dump = nft_ct_get_dump,
+ .reduce = nft_ct_set_reduce,
+};
+#endif
+
static const struct nft_expr_ops nft_ct_set_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@ -708,6 +795,7 @@ static const struct nft_expr_ops nft_ct_set_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -718,6 +806,7 @@ static const struct nft_expr_ops nft_ct_set_zone_ops = {
.init = nft_ct_set_init,
.destroy = nft_ct_set_destroy,
.dump = nft_ct_set_dump,
+ .reduce = nft_ct_set_reduce,
};
#endif
@@ -731,8 +820,21 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG])
return ERR_PTR(-EINVAL);
- if (tb[NFTA_CT_DREG])
+ if (tb[NFTA_CT_DREG]) {
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
+
+ switch (k) {
+ case NFT_CT_STATE:
+ case NFT_CT_DIRECTION:
+ case NFT_CT_STATUS:
+ case NFT_CT_MARK:
+ case NFT_CT_SECMARK:
+ return &nft_ct_get_fast_ops;
+ }
+#endif
return &nft_ct_get_ops;
+ }
if (tb[NFTA_CT_SREG]) {
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -774,6 +876,7 @@ static const struct nft_expr_ops nft_notrack_ops = {
.type = &nft_notrack_type,
.size = NFT_EXPR_SIZE(0),
.eval = nft_notrack_eval,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_notrack_type __read_mostly = {
@@ -797,9 +900,11 @@ nft_ct_timeout_parse_policy(void *timeouts,
if (!tb)
return -ENOMEM;
- ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
- attr, l4proto->ctnl_timeout.nla_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb,
+ l4proto->ctnl_timeout.nlattr_max,
+ attr,
+ l4proto->ctnl_timeout.nla_policy,
+ NULL);
if (ret < 0)
goto err;
@@ -846,7 +951,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj,
*/
values = nf_ct_timeout_data(timeout);
if (values)
- nf_ct_refresh(ct, pkt->skb, values[0]);
+ nf_ct_refresh(ct, values[0]);
}
static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
@@ -870,7 +975,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]);
priv->l4proto = l4num;
- l4proto = nf_ct_l4proto_find_get(l4num);
+ l4proto = nf_ct_l4proto_find(l4num);
if (l4proto->l4proto != l4num) {
ret = -EOPNOTSUPP;
@@ -902,7 +1007,6 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
err_free_timeout:
kfree(timeout);
err_proto_put:
- nf_ct_l4proto_put(l4proto);
return ret;
}
@@ -913,7 +1017,6 @@ static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx,
struct nf_ct_timeout *timeout = priv->timeout;
nf_ct_untimeout(ctx->net, timeout);
- nf_ct_l4proto_put(timeout->l4proto);
nf_ct_netns_put(ctx->net, ctx->family);
kfree(priv->timeout);
}
@@ -930,7 +1033,7 @@ static int nft_ct_timeout_obj_dump(struct sk_buff *skb,
nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num)))
return -1;
- nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA | NLA_F_NESTED);
+ nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA);
if (!nest_params)
return -1;
@@ -984,7 +1087,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
if (!priv->l4proto)
return -ENOENT;
- nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
+ nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name));
if (tb[NFTA_CT_HELPER_L3PROTO])
family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO]));
@@ -1007,8 +1110,8 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx,
help6 = nf_conntrack_helper_try_module_get(name, family,
priv->l4proto);
break;
- case NFPROTO_NETDEV: /* fallthrough */
- case NFPROTO_BRIDGE: /* same */
+ case NFPROTO_NETDEV:
+ case NFPROTO_BRIDGE:
case NFPROTO_INET:
help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4,
priv->l4proto);
@@ -1090,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
if (help) {
rcu_assign_pointer(help->helper, to_assign);
set_bit(IPS_HELPER_BIT, &ct->status);
+
+ if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct))
+ if (!nfct_seqadj_ext_add(ct))
+ regs->verdict.code = NF_DROP;
}
}
@@ -1148,6 +1255,158 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = {
.owner = THIS_MODULE,
};
+struct nft_ct_expect_obj {
+ u16 l3num;
+ __be16 dport;
+ u8 l4proto;
+ u8 size;
+ u32 timeout;
+};
+
+static int nft_ct_expect_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (!tb[NFTA_CT_EXPECT_L4PROTO] ||
+ !tb[NFTA_CT_EXPECT_DPORT] ||
+ !tb[NFTA_CT_EXPECT_TIMEOUT] ||
+ !tb[NFTA_CT_EXPECT_SIZE])
+ return -EINVAL;
+
+ priv->l3num = ctx->family;
+ if (tb[NFTA_CT_EXPECT_L3PROTO])
+ priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO]));
+
+ switch (priv->l3num) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET)
+ break;
+
+ return -EINVAL;
+ case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]);
+ switch (priv->l4proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ case IPPROTO_DCCP:
+ case IPPROTO_SCTP:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]);
+ priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]);
+ priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]);
+
+ return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_ct_expect_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+
+ if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) ||
+ nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) ||
+ nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) ||
+ nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size))
+ return -1;
+
+ return 0;
+}
+
+static void nft_ct_expect_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct_expect_obj *priv = nft_obj_data(obj);
+ struct nf_conntrack_expect *exp;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_help *help;
+ enum ip_conntrack_dir dir;
+ u16 l3num = priv->l3num;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ dir = CTINFO2DIR(ctinfo);
+
+ help = nfct_help(ct);
+ if (!help)
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+ if (!help) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ if (l3num == NFPROTO_INET)
+ l3num = nf_ct_l3num(ct);
+
+ exp = nf_ct_expect_alloc(ct);
+ if (exp == NULL) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num,
+ &ct->tuplehash[!dir].tuple.src.u3,
+ &ct->tuplehash[!dir].tuple.dst.u3,
+ priv->l4proto, NULL, &priv->dport);
+ exp->timeout.expires = jiffies + priv->timeout * HZ;
+
+ if (nf_ct_expect_related(exp, 0) != 0)
+ regs->verdict.code = NF_DROP;
+}
+
+static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = {
+ [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 },
+ [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 },
+ [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 },
+ [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 },
+};
+
+static struct nft_object_type nft_ct_expect_obj_type;
+
+static const struct nft_object_ops nft_ct_expect_obj_ops = {
+ .type = &nft_ct_expect_obj_type,
+ .size = sizeof(struct nft_ct_expect_obj),
+ .eval = nft_ct_expect_obj_eval,
+ .init = nft_ct_expect_obj_init,
+ .destroy = nft_ct_expect_obj_destroy,
+ .dump = nft_ct_expect_obj_dump,
+};
+
+static struct nft_object_type nft_ct_expect_obj_type __read_mostly = {
+ .type = NFT_OBJECT_CT_EXPECT,
+ .ops = &nft_ct_expect_obj_ops,
+ .maxattr = NFTA_CT_EXPECT_MAX,
+ .policy = nft_ct_expect_policy,
+ .owner = THIS_MODULE,
+};
+
static int __init nft_ct_module_init(void)
{
int err;
@@ -1165,17 +1424,23 @@ static int __init nft_ct_module_init(void)
err = nft_register_obj(&nft_ct_helper_obj_type);
if (err < 0)
goto err2;
+
+ err = nft_register_obj(&nft_ct_expect_obj_type);
+ if (err < 0)
+ goto err3;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
err = nft_register_obj(&nft_ct_timeout_obj_type);
if (err < 0)
- goto err3;
+ goto err4;
#endif
return 0;
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+err4:
+ nft_unregister_obj(&nft_ct_expect_obj_type);
+#endif
err3:
nft_unregister_obj(&nft_ct_helper_obj_type);
-#endif
err2:
nft_unregister_expr(&nft_notrack_type);
err1:
@@ -1188,6 +1453,7 @@ static void __exit nft_ct_module_exit(void)
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
nft_unregister_obj(&nft_ct_timeout_obj_type);
#endif
+ nft_unregister_obj(&nft_ct_expect_obj_type);
nft_unregister_obj(&nft_ct_helper_obj_type);
nft_unregister_expr(&nft_notrack_type);
nft_unregister_expr(&nft_ct_type);
@@ -1202,3 +1468,5 @@ MODULE_ALIAS_NFT_EXPR("ct");
MODULE_ALIAS_NFT_EXPR("notrack");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER);
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT);
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT);
+MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c
new file mode 100644
index 000000000000..e684c8a91848
--- /dev/null
+++ b/net/netfilter/nft_ct_fast.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#if IS_ENABLED(CONFIG_NFT_CT)
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack.h>
+
+void nft_ct_get_fast_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_ct *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn *ct;
+ unsigned int state;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+
+ switch (priv->key) {
+ case NFT_CT_STATE:
+ if (ct)
+ state = NF_CT_STATE_BIT(ctinfo);
+ else if (ctinfo == IP_CT_UNTRACKED)
+ state = NF_CT_STATE_UNTRACKED_BIT;
+ else
+ state = NF_CT_STATE_INVALID_BIT;
+ *dest = state;
+ return;
+ default:
+ break;
+ }
+
+ if (!ct) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ switch (priv->key) {
+ case NFT_CT_DIRECTION:
+ nft_reg_store8(dest, CTINFO2DIR(ctinfo));
+ return;
+ case NFT_CT_STATUS:
+ *dest = ct->status;
+ return;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+ case NFT_CT_MARK:
+ *dest = ct->mark;
+ return;
+#endif
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+ case NFT_CT_SECMARK:
+ *dest = ct->secmark;
+ return;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ regs->verdict.code = NFT_BREAK;
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nft_ct_get_fast_eval);
+#endif
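The point of the _fast variant is that, with retpolines enabled, an indirect call through ops->eval is expensive, so the hottest keys get a direct-called switch instead. An illustrative reduction of that idea (types and names are assumptions, not the kernel's):

#include <stdint.h>

enum key { KEY_STATE, KEY_MARK, KEY_LABELS /* slow-path only */ };

struct conn { uint32_t state, mark; };

/* Compiles to direct branches, so no retpoline thunk is needed;
 * unknown keys fall back to the generic indirect ops->eval path.
 */
static inline int eval_fast(const struct conn *c, enum key k, uint32_t *dest)
{
	switch (k) {
	case KEY_STATE:
		*dest = c->state;
		return 0;
	case KEY_MARK:
		*dest = c->mark;
		return 0;
	default:
		return -1;      /* caller uses the slow path */
	}
}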
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
index 15cc62b293d6..0573f96ce079 100644
--- a/net/netfilter/nft_dup_netdev.c
+++ b/net/netfilter/nft_dup_netdev.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -13,10 +10,11 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
struct nft_dup_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_dup_netdev_eval(const struct nft_expr *expr,
@@ -42,11 +40,12 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
-static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_netdev *priv = nft_expr_priv(expr);
@@ -59,6 +58,21 @@ nla_put_failure:
return -1;
}
+static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_dup_netdev *priv = nft_expr_priv(expr);
+ int oif = ctx->regs[priv->sreg_dev].data.data[0];
+
+ return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif);
+}
+
+static bool nft_dup_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
static struct nft_expr_type nft_dup_netdev_type;
static const struct nft_expr_ops nft_dup_netdev_ops = {
.type = &nft_dup_netdev_type,
@@ -66,6 +80,9 @@ static const struct nft_expr_ops nft_dup_netdev_ops = {
.eval = nft_dup_netdev_eval,
.init = nft_dup_netdev_init,
.dump = nft_dup_netdev_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_dup_netdev_offload,
+ .offload_action = nft_dup_netdev_offload_action,
};
static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
@@ -93,3 +110,4 @@ module_exit(nft_dup_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(5, "dup");
+MODULE_DESCRIPTION("nftables netdev packet duplication support");
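Both the eval and the new offload path above read the output interface as an ifindex stored in a single 32-bit register (note the sizeof(int) register load). A small sketch of that round trip through a register file; the array layout here is illustrative:

#include <stdint.h>
#include <string.h>

#define REG32_COUNT 16

struct regs { uint32_t data[REG32_COUNT]; };

static void store_oif(struct regs *r, unsigned int sreg, int ifindex)
{
	memcpy(&r->data[sreg], &ifindex, sizeof(int));
}

static int load_oif(const struct regs *r, unsigned int sreg)
{
	int ifindex;

	memcpy(&ifindex, &r->data[sreg], sizeof(int));
	return ifindex;	/* cf. ctx->regs[priv->sreg_dev].data.data[0] */
}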
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 07d4efd3d851..7807d8129664 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/kernel.h>
@@ -20,72 +16,91 @@ struct nft_dynset {
struct nft_set *set;
struct nft_set_ext_tmpl tmpl;
enum nft_dynset_ops op:8;
- enum nft_registers sreg_key:8;
- enum nft_registers sreg_data:8;
+ u8 sreg_key;
+ u8 sreg_data;
bool invert;
+ bool expr;
+ u8 num_exprs;
u64 timeout;
- struct nft_expr *expr;
+ struct nft_expr *expr_array[NFT_SET_EXPR_MAX];
struct nft_set_binding binding;
};
-static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
- struct nft_regs *regs)
+static int nft_dynset_expr_setup(const struct nft_dynset *priv,
+ const struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
+ if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
+ return -1;
+
+ elem_expr->size += priv->expr_array[i]->ops->size;
+ }
+
+ return 0;
+}
+
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+ const struct nft_expr *expr,
+ struct nft_regs *regs)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set_ext *ext;
+ void *elem_priv;
u64 timeout;
- void *elem;
if (!atomic_add_unless(&set->nelems, 1, set->size))
return NULL;
- timeout = priv->timeout ? : set->timeout;
- elem = nft_set_elem_init(set, &priv->tmpl,
- &regs->data[priv->sreg_key],
- &regs->data[priv->sreg_data],
- timeout, GFP_ATOMIC);
- if (elem == NULL)
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ elem_priv = nft_set_elem_init(set, &priv->tmpl,
+ &regs->data[priv->sreg_key], NULL,
+ &regs->data[priv->sreg_data],
+ timeout, 0, GFP_ATOMIC);
+ if (IS_ERR(elem_priv))
goto err1;
- ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL &&
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ ext = nft_set_elem_ext(set, elem_priv);
+ if (priv->num_exprs && nft_dynset_expr_setup(priv, ext) < 0)
goto err2;
- return elem;
+ return elem_priv;
err2:
- nft_set_elem_destroy(set, elem, false);
+ nft_set_elem_destroy(set, elem_priv, false);
err1:
if (set->size)
atomic_dec(&set->nelems);
return NULL;
}
-static void nft_dynset_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_dynset_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set *set = priv->set;
const struct nft_set_ext *ext;
- const struct nft_expr *sexpr;
u64 timeout;
- if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
- expr, regs, &ext)) {
- sexpr = NULL;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- sexpr = nft_set_ext_expr(ext);
+ if (priv->op == NFT_DYNSET_OP_DELETE) {
+ set->ops->delete(set, &regs->data[priv->sreg_key]);
+ return;
+ }
+ ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs);
+ if (ext) {
if (priv->op == NFT_DYNSET_OP_UPDATE &&
- nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
- timeout = priv->timeout ? : set->timeout;
- *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
+ nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
+ READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
+ timeout = priv->timeout ? : READ_ONCE(set->timeout);
+ WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + timeout);
}
- if (sexpr != NULL)
- sexpr->ops->eval(sexpr, regs, pkt);
+ nft_set_elem_update_expr(ext, regs, pkt);
if (priv->invert)
regs->verdict.code = NFT_BREAK;
@@ -96,29 +111,66 @@ static void nft_dynset_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static void nft_dynset_ext_add_expr(struct nft_dynset *priv)
+{
+ u8 size = 0;
+ int i;
+
+ for (i = 0; i < priv->num_exprs; i++)
+ size += priv->expr_array[i]->ops->size;
+
+ nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPRESSIONS,
+ sizeof(struct nft_set_elem_expr) + size);
+}
+
+static struct nft_expr *
+nft_dynset_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set,
+ const struct nlattr *attr, int pos)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_set_elem_expr_alloc(ctx, set, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ if (set->exprs[pos] && set->exprs[pos]->ops != expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_dynset_expr;
+ }
+
+ return expr;
+
+err_dynset_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = {
[NFTA_DYNSET_SET_NAME] = { .type = NLA_STRING,
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_DYNSET_SET_ID] = { .type = NLA_U32 },
- [NFTA_DYNSET_OP] = { .type = NLA_U32 },
+ [NFTA_DYNSET_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_DYNSET_SREG_KEY] = { .type = NLA_U32 },
[NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 },
[NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_DYNSET_EXPR] = { .type = NLA_NESTED },
[NFTA_DYNSET_FLAGS] = { .type = NLA_U32 },
+ [NFTA_DYNSET_EXPRESSIONS] = { .type = NLA_NESTED },
};
static int nft_dynset_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ struct nftables_pernet *nft_net = nft_pernet(ctx->net);
struct nft_dynset *priv = nft_expr_priv(expr);
u8 genmask = nft_genmask_next(ctx->net);
struct nft_set *set;
u64 timeout;
- int err;
+ int err, i;
- lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_assert_held(&nft_net->commit_mutex);
if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
tb[NFTA_DYNSET_OP] == NULL ||
@@ -127,11 +179,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (tb[NFTA_DYNSET_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS]));
-
- if (flags & ~NFT_DYNSET_F_INV)
- return -EINVAL;
+ if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR))
+ return -EOPNOTSUPP;
if (flags & NFT_DYNSET_F_INV)
priv->invert = true;
+ if (flags & NFT_DYNSET_F_EXPR)
+ priv->expr = true;
}
set = nft_set_lookup_global(ctx->net, ctx->table,
@@ -140,6 +193,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
+ if (set->flags & NFT_SET_OBJECT)
+ return -EOPNOTSUPP;
+
if (set->ops->update == NULL)
return -EOPNOTSUPP;
@@ -147,81 +203,125 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
return -EBUSY;
priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP]));
- switch (priv->op) {
- case NFT_DYNSET_OP_ADD:
- break;
- case NFT_DYNSET_OP_UPDATE:
- if (!(set->flags & NFT_SET_TIMEOUT))
- return -EOPNOTSUPP;
- break;
- default:
+ if (priv->op > NFT_DYNSET_OP_DELETE)
return -EOPNOTSUPP;
- }
timeout = 0;
if (tb[NFTA_DYNSET_TIMEOUT] != NULL) {
if (!(set->flags & NFT_SET_TIMEOUT))
- return -EINVAL;
- timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
- tb[NFTA_DYNSET_TIMEOUT])));
+ return -EOPNOTSUPP;
+
+ err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout);
+ if (err)
+ return err;
}
- priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]);
- err = nft_validate_register_load(priv->sreg_key, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_KEY], &priv->sreg_key,
+ set->klen);
if (err < 0)
return err;
if (tb[NFTA_DYNSET_SREG_DATA] != NULL) {
if (!(set->flags & NFT_SET_MAP))
- return -EINVAL;
+ return -EOPNOTSUPP;
if (set->dtype == NFT_DATA_VERDICT)
return -EOPNOTSUPP;
- priv->sreg_data = nft_parse_register(tb[NFTA_DYNSET_SREG_DATA]);
- err = nft_validate_register_load(priv->sreg_data, set->dlen);
+ err = nft_parse_register_load(ctx, tb[NFTA_DYNSET_SREG_DATA],
+ &priv->sreg_data, set->dlen);
if (err < 0)
return err;
} else if (set->flags & NFT_SET_MAP)
return -EINVAL;
- if (tb[NFTA_DYNSET_EXPR] != NULL) {
- if (!(set->flags & NFT_SET_EVAL))
- return -EINVAL;
+ if ((tb[NFTA_DYNSET_EXPR] || tb[NFTA_DYNSET_EXPRESSIONS]) &&
+ !(set->flags & NFT_SET_EVAL))
+ return -EINVAL;
- priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
- if (IS_ERR(priv->expr))
- return PTR_ERR(priv->expr);
+ if (tb[NFTA_DYNSET_EXPR]) {
+ struct nft_expr *dynset_expr;
- err = -EOPNOTSUPP;
- if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err1;
-
- if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
- if (set->flags & NFT_SET_TIMEOUT)
- goto err1;
- if (!set->ops->gc_init)
- goto err1;
- set->ops->gc_init(set);
+ dynset_expr = nft_dynset_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR], 0);
+ if (IS_ERR(dynset_expr))
+ return PTR_ERR(dynset_expr);
+
+ priv->num_exprs++;
+ priv->expr_array[0] = dynset_expr;
+
+ if (set->num_exprs > 1 ||
+ (set->num_exprs == 1 &&
+ dynset_expr->ops != set->exprs[0]->ops)) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ } else if (tb[NFTA_DYNSET_EXPRESSIONS]) {
+ struct nft_expr *dynset_expr;
+ struct nlattr *tmp;
+ int left;
+
+ if (!priv->expr)
+ return -EINVAL;
+
+ i = 0;
+ nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) {
+ if (i == NFT_SET_EXPR_MAX) {
+ err = -E2BIG;
+ goto err_expr_free;
+ }
+ if (nla_type(tmp) != NFTA_LIST_ELEM) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ dynset_expr = nft_dynset_expr_alloc(ctx, set, tmp, i);
+ if (IS_ERR(dynset_expr)) {
+ err = PTR_ERR(dynset_expr);
+ goto err_expr_free;
+ }
+ priv->expr_array[i] = dynset_expr;
+ priv->num_exprs++;
+
+ if (set->num_exprs) {
+ if (i >= set->num_exprs) {
+ err = -EINVAL;
+ goto err_expr_free;
+ }
+ if (dynset_expr->ops != set->exprs[i]->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ }
+ i++;
}
+ if (set->num_exprs && set->num_exprs != i) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
+ } else if (set->num_exprs > 0) {
+ err = nft_set_elem_expr_clone(ctx, set, priv->expr_array);
+ if (err < 0)
+ return err;
+
+ priv->num_exprs = set->num_exprs;
}
nft_set_ext_prepare(&priv->tmpl);
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_KEY, set->klen);
if (set->flags & NFT_SET_MAP)
nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_DATA, set->dlen);
- if (priv->expr != NULL)
- nft_set_ext_add_length(&priv->tmpl, NFT_SET_EXT_EXPR,
- priv->expr->ops->size);
- if (set->flags & NFT_SET_TIMEOUT) {
- if (timeout || set->timeout)
- nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_EXPIRATION);
- }
+
+ if (priv->num_exprs)
+ nft_dynset_ext_add_expr(priv);
+
+ if (set->flags & NFT_SET_TIMEOUT &&
+ (timeout || READ_ONCE(set->timeout)))
+ nft_set_ext_add(&priv->tmpl, NFT_SET_EXT_TIMEOUT);
priv->timeout = timeout;
err = nf_tables_bind_set(ctx, set, &priv->binding);
if (err < 0)
- goto err1;
+ goto err_expr_free;
if (set->size == 0)
set->size = 0xffff;
@@ -229,43 +329,47 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->set = set;
return 0;
-err1:
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+err_expr_free:
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
return err;
}
-static void nft_dynset_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_dynset_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_dynset *priv = nft_expr_priv(expr);
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+ nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
}
-static void nft_dynset_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_dynset_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_dynset_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_dynset *priv = nft_expr_priv(expr);
+ int i;
- if (priv->expr != NULL)
- nft_expr_destroy(ctx, priv->expr);
+ for (i = 0; i < priv->num_exprs; i++)
+ nft_expr_destroy(ctx, priv->expr_array[i]);
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dynset_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_dynset *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0;
+ int i;
if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key))
goto nla_put_failure;
@@ -277,11 +381,29 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr)
if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name))
goto nla_put_failure;
if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT,
- cpu_to_be64(jiffies_to_msecs(priv->timeout)),
+ nf_jiffies64_to_msecs(priv->timeout),
NFTA_DYNSET_PAD))
goto nla_put_failure;
- if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr))
- goto nla_put_failure;
+ if (priv->set->num_exprs == 0) {
+ if (priv->num_exprs == 1) {
+ if (nft_expr_dump(skb, NFTA_DYNSET_EXPR,
+ priv->expr_array[0], reset))
+ goto nla_put_failure;
+ } else if (priv->num_exprs > 1) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start_noflag(skb, NFTA_DYNSET_EXPRESSIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ for (i = 0; i < priv->num_exprs; i++) {
+ if (nft_expr_dump(skb, NFTA_LIST_ELEM,
+ priv->expr_array[i], reset))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nest);
+ }
+ }
if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
@@ -299,6 +421,7 @@ static const struct nft_expr_ops nft_dynset_ops = {
.activate = nft_dynset_activate,
.deactivate = nft_dynset_deactivate,
.dump = nft_dynset_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_dynset_type __read_mostly = {
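nft_dynset_expr_setup() above packs a variable number of per-element expression clones into one contiguous blob, using a running size so each clone lands right after the previous one. A userspace model of that layout trick (types are illustrative; "size" here means the whole serialized expression):

#include <stdlib.h>
#include <string.h>

struct expr { size_t size; /* total serialized size in bytes */ };

struct elem_expr {
	size_t size;            /* bytes used so far */
	unsigned char data[];   /* expression states, back to back */
};

static int setup_exprs(struct elem_expr *blob,
		       struct expr *const *array, int n)
{
	for (int i = 0; i < n; i++) {
		/* next clone starts where the previous one ended */
		memcpy(blob->data + blob->size, array[i], array[i]->size);
		blob->size += array[i]->size;
	}
	return 0;
}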
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a940c9fd9045..7eedf4e3ae9c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -1,18 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/kernel.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/dccp.h>
+#include <linux/sctp.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/tcp.h>
@@ -22,8 +21,8 @@ struct nft_exthdr {
u8 offset;
u8 len;
u8 op;
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
+ u8 dreg;
+ u8 sreg;
u8 flags;
};
@@ -36,6 +35,14 @@ static unsigned int optlen(const u8 *opt, unsigned int offset)
return opt[offset + 1];
}
+static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len)
+{
+ if (len % NFT_REG32_SIZE)
+ dest[len / NFT_REG32_SIZE] = 0;
+
+ return skb_copy_bits(skb, offset, dest, len);
+}
+
static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -45,17 +52,113 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr,
unsigned int offset = 0;
int err;
+ if (pkt->skb->protocol != htons(ETH_P_IPV6))
+ goto err;
+
err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL);
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = (err >= 0);
+ nft_reg_store8(dest, err >= 0);
+ return;
+ } else if (err < 0) {
+ goto err;
+ }
+ offset += priv->offset;
+
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
+ goto err;
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+/* Find the offset of the specified IPv4 option.
+ *
+ * If the target option is found, its offset is stored in *offset and the
+ * option number is returned. Otherwise, a negative error is returned.
+ *
+ * If the first fragment doesn't contain the End of Options list, the
+ * header is considered invalid.
+ */
+static int ipv4_find_option(struct net *net, struct sk_buff *skb,
+ unsigned int *offset, int target)
+{
+ unsigned char optbuf[sizeof(struct ip_options) + 40];
+ struct ip_options *opt = (struct ip_options *)optbuf;
+ struct iphdr *iph, _iph;
+ bool found = false;
+ __be32 info;
+ int optlen;
+
+ iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
+ if (!iph)
+ return -EBADMSG;
+
+ optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
+ if (optlen <= 0)
+ return -ENOENT;
+
+ memset(opt, 0, sizeof(struct ip_options));
+	/* Copy the options since __ip_options_compile() modifies
+	 * them in place.
+	 */
+ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen))
+ return -EBADMSG;
+ opt->optlen = optlen;
+
+ if (__ip_options_compile(net, opt, NULL, &info))
+ return -EBADMSG;
+
+ switch (target) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if (!opt->srr)
+ break;
+ found = target == IPOPT_SSRR ? opt->is_strictroute :
+ !opt->is_strictroute;
+ if (found)
+ *offset = opt->srr;
+ break;
+ case IPOPT_RR:
+ if (!opt->rr)
+ break;
+ *offset = opt->rr;
+ found = true;
+ break;
+ case IPOPT_RA:
+ if (!opt->router_alert)
+ break;
+ *offset = opt->router_alert;
+ found = true;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return found ? target : -ENOENT;
+}
+
+static void nft_exthdr_ipv4_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct sk_buff *skb = pkt->skb;
+ unsigned int offset;
+ int err;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ goto err;
+
+ err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type);
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ nft_reg_store8(dest, err >= 0);
return;
} else if (err < 0) {
goto err;
}
offset += priv->offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
- if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0)
+ if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0)
goto err;
return;
err:
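nft_skb_copy_to_reg() exists because registers are 32-bit words: when the copied length is not a multiple of 4, the final word is zeroed first so no stale bytes survive into a later comparison. A minimal userspace sketch of the same helper:

#include <stdint.h>
#include <string.h>

#define REG32_SIZE 4

static void copy_to_reg(uint32_t *dest, const void *src, unsigned int len)
{
	if (len % REG32_SIZE)
		dest[len / REG32_SIZE] = 0;   /* clear the partial word */
	memcpy(dest, src, len);
}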
@@ -68,10 +171,10 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
{
struct tcphdr *tcph;
- if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP)
+ if (pkt->tprot != IPPROTO_TCP || pkt->fragoff)
return NULL;
- tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buffer);
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer);
if (!tcph)
return NULL;
@@ -79,7 +182,7 @@ nft_tcp_header_pointer(const struct nft_pktinfo *pkt,
if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len)
return NULL;
- return skb_header_pointer(pkt->skb, pkt->xt.thoff, *tcphdr_len, buffer);
+ return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer);
}
static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
@@ -109,9 +212,10 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr,
offset = i + priv->offset;
if (priv->flags & NFT_EXTHDR_F_PRESENT) {
- *dest = 1;
+ nft_reg_store8(dest, 1);
} else {
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
memcpy(dest, opt + offset, priv->len);
}
@@ -134,16 +238,19 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
unsigned int i, optl, tcphdr_len, offset;
struct tcphdr *tcph;
u8 *opt;
- u32 src;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
if (!tcph)
- return;
+ goto err;
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto err;
+
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
opt = (u8 *)tcph;
+
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
- u8 octet;
__be16 v16;
__be32 v32;
} old, new;
@@ -154,23 +261,15 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
continue;
if (i + optl > tcphdr_len || priv->len + priv->offset > optl)
- return;
-
- if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len))
- return;
-
- tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff,
- &tcphdr_len);
- if (!tcph)
- return;
+ goto err;
- src = regs->data[priv->sreg];
offset = i + priv->offset;
switch (priv->len) {
case 2:
- old.v16 = get_unaligned((u16 *)(opt + offset));
- new.v16 = src;
+ old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset));
+ new.v16 = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg]);
switch (priv->type) {
case TCPOPT_MSS:
@@ -183,18 +282,18 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (old.v16 == new.v16)
return;
- put_unaligned(new.v16, (u16*)(opt + offset));
+ put_unaligned(new.v16, (__be16*)(opt + offset));
inet_proto_csum_replace2(&tcph->check, pkt->skb,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = src;
- old.v32 = get_unaligned((u32 *)(opt + offset));
+ new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]);
+ old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
return;
- put_unaligned(new.v32, (u32*)(opt + offset));
+ put_unaligned(new.v32, (__be32*)(opt + offset));
inet_proto_csum_replace4(&tcph->check, pkt->skb,
old.v32, new.v32, false);
break;
@@ -205,15 +304,194 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
return;
}
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE];
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int i, tcphdr_len, optl;
+ struct tcphdr *tcph;
+ u8 *opt;
+
+ tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
+ if (!tcph)
+ goto err;
+
+ if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len))
+ goto drop;
+
+ tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt));
+ opt = (u8 *)tcph;
+
+ for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
+ unsigned int j;
+
+ optl = optlen(opt, i);
+ if (priv->type != opt[i])
+ continue;
+
+ if (i + optl > tcphdr_len)
+ goto drop;
+
+ for (j = 0; j < optl; ++j) {
+ u16 n = TCPOPT_NOP;
+ u16 o = opt[i+j];
+
+ if ((i + j) % 2 == 0) {
+ o <<= 8;
+ n <<= 8;
+ }
+ inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o),
+ htons(n), false);
+ }
+ memset(opt + i, TCPOPT_NOP, optl);
+ return;
+ }
+
+ /* Option not found, continue. This allows multiple option
+ * removals per rule.
+ */
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+ return;
+drop:
+ /* can't remove, no choice but to drop */
+ regs->verdict.code = NF_DROP;
+}
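The per-byte loop above folds every overwritten option byte into the TCP checksum through inet_proto_csum_replace2(); the (i + j) % 2 test decides whether the byte occupies the high or the low half of its 16-bit checksum word. The underlying update is the incremental form from RFC 1624, HC' = ~(~HC + ~m + m'), sketched standalone:

#include <stdint.h>

/* Replace 16-bit word 'old' with 'new16' in a datagram covered by
 * checksum 'check' (RFC 1624, eqn. 3). */
static uint16_t csum_replace16(uint16_t check, uint16_t old, uint16_t new16)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old;
	sum += new16;
	while (sum >> 16)			/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}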
+
+static void nft_exthdr_sctp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr);
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct sctp_chunkhdr *sch;
+ struct sctp_chunkhdr _sch;
+
+ if (pkt->tprot != IPPROTO_SCTP)
+ goto err;
+
+ do {
+ sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch);
+ if (!sch || !sch->length)
+ break;
+
+ if (sch->type == priv->type) {
+ if (priv->flags & NFT_EXTHDR_F_PRESENT) {
+ nft_reg_store8(dest, true);
+ return;
+ }
+ if (priv->offset + priv->len > ntohs(sch->length) ||
+ offset + ntohs(sch->length) > pkt->skb->len)
+ break;
+
+ if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset,
+ dest, priv->len) < 0)
+ break;
+ return;
+ }
+ offset += SCTP_PAD4(ntohs(sch->length));
+ } while (offset < pkt->skb->len);
+err:
+ if (priv->flags & NFT_EXTHDR_F_PRESENT)
+ nft_reg_store8(dest, false);
+ else
+ regs->verdict.code = NFT_BREAK;
}
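SCTP chunks are type/flags/length triplets whose 16-bit length field excludes padding, while the next chunk starts on the next 4-byte boundary, hence the SCTP_PAD4() advance above. The same walk as a standalone sketch:

#include <stdint.h>

#define SCTP_PAD4(len) (((len) + 3) & ~3U)

/* Illustrative presence check over a buffer of SCTP chunks. */
static int sctp_chunk_present(const uint8_t *p, unsigned int datalen,
			      uint8_t target)
{
	unsigned int off = 0;

	while (off + 4 <= datalen) {
		uint16_t len = (p[off + 2] << 8) | p[off + 3];

		if (len < 4)
			return -1;	/* shorter than the chunk header */
		if (p[off] == target)
			return 1;
		off += SCTP_PAD4(len);
	}
	return 0;
}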
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static void nft_exthdr_dccp_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ unsigned int thoff, dataoff, optoff, optlen, i;
+ u32 *dest = &regs->data[priv->dreg];
+ const struct dccp_hdr *dh;
+ struct dccp_hdr _dh;
+
+ if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff)
+ goto err;
+
+ thoff = nft_thoff(pkt);
+
+ dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh);
+ if (!dh)
+ goto err;
+
+ dataoff = dh->dccph_doff * sizeof(u32);
+ optoff = __dccp_hdr_len(dh);
+ if (dataoff <= optoff)
+ goto err;
+
+ optlen = dataoff - optoff;
+
+ for (i = 0; i < optlen; ) {
+ /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are one byte
+ * long; the remaining options are at least two bytes long. In
+ * all cases, the first byte contains the option type. In
+ * multi-byte options, the second byte contains the option
+ * length, which must be at least two: 1 for the type plus 1 for
+ * the length plus 0-253 for any following option data. We
+ * aren't interested in the option data, only the type and the
+ * length, so we don't need to read more than two bytes at a
+ * time.
+ */
+ unsigned int buflen = optlen - i;
+ u8 buf[2], *bufp;
+ u8 type, len;
+
+ if (buflen > sizeof(buf))
+ buflen = sizeof(buf);
+
+ bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen,
+ &buf);
+ if (!bufp)
+ goto err;
+
+ type = bufp[0];
+
+ if (type == priv->type) {
+ nft_reg_store8(dest, 1);
+ return;
+ }
+
+ if (type <= DCCPO_MAX_RESERVED) {
+ i++;
+ continue;
+ }
+
+ if (buflen < 2)
+ goto err;
+
+ len = bufp[1];
+
+ if (len < 2)
+ goto err;
+
+ i += len;
+ }
+
+err:
+ *dest = 0;
+}
+#endif
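Because of the encoding described in the comment inside nft_exthdr_dccp_eval() above, bounded two-byte reads are all the parser needs. In plain C the same walk looks like this (illustrative, not the kernel path):

#include <stdint.h>

#define DCCPO_MAX_RESERVED 31	/* options 0..31 are single-byte */

static int dccp_option_present(const uint8_t *opts, unsigned int optlen,
			       uint8_t target)
{
	unsigned int i = 0;

	while (i < optlen) {
		uint8_t type = opts[i];

		if (type == target)
			return 1;
		if (type <= DCCPO_MAX_RESERVED) {
			i++;		/* single-byte option */
			continue;
		}
		if (i + 1 >= optlen || opts[i + 1] < 2)
			return -1;	/* truncated or malformed */
		i += opts[i + 1];
	}
	return 0;
}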
+
static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
[NFTA_EXTHDR_DREG] = { .type = NLA_U32 },
[NFTA_EXTHDR_TYPE] = { .type = NLA_U8 },
[NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 },
- [NFTA_EXTHDR_LEN] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 },
- [NFTA_EXTHDR_OP] = { .type = NLA_U32 },
+ [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_EXTHDR_SREG] = { .type = NLA_U32 },
};
@@ -257,12 +535,12 @@ static int nft_exthdr_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
@@ -307,12 +585,74 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
priv->offset = offset;
priv->len = len;
- priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]);
priv->flags = flags;
priv->op = op;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg,
+ priv->len);
+}
+
+static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_EXTHDR_SREG] ||
+ tb[NFTA_EXTHDR_DREG] ||
+ tb[NFTA_EXTHDR_FLAGS] ||
+ tb[NFTA_EXTHDR_OFFSET] ||
+ tb[NFTA_EXTHDR_LEN])
+ return -EINVAL;
+
+ if (!tb[NFTA_EXTHDR_TYPE])
+ return -EINVAL;
+
+ priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]);
+ priv->op = NFT_EXTHDR_OP_TCPOPT;
+
+ return 0;
+}
+
+static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ switch (priv->type) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ case IPOPT_RR:
+ case IPOPT_RA:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static int nft_exthdr_dccp_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_exthdr *priv = nft_expr_priv(expr);
+ int err = nft_exthdr_init(ctx, expr, tb);
+
+ if (err < 0)
+ return err;
+
+ if (!(priv->flags & NFT_EXTHDR_F_PRESENT))
+ return -EOPNOTSUPP;
+
+ return 0;
}
+#endif
static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
{
@@ -332,7 +672,8 @@ nla_put_failure:
return -1;
}
-static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -342,7 +683,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
-static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_exthdr_dump_set(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_exthdr *priv = nft_expr_priv(expr);
@@ -352,12 +694,57 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr)
return nft_exthdr_dump_common(skb, priv);
}
+static int nft_exthdr_dump_strip(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+
+ return nft_exthdr_dump_common(skb, priv);
+}
+
+static bool nft_exthdr_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_exthdr *priv = nft_expr_priv(expr);
+ const struct nft_exthdr *exthdr;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ exthdr = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->type != exthdr->type ||
+ priv->op != exthdr->op ||
+ priv->flags != exthdr->flags ||
+ priv->offset != exthdr->offset ||
+ priv->len != exthdr->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
static const struct nft_expr_ops nft_exthdr_ipv6_ops = {
.type = &nft_exthdr_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
.eval = nft_exthdr_ipv6_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+
+static const struct nft_expr_ops nft_exthdr_ipv4_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_ipv4_eval,
+ .init = nft_exthdr_ipv4_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_ops = {
@@ -366,6 +753,7 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = {
.eval = nft_exthdr_tcp_eval,
.init = nft_exthdr_init,
.dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
};
static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
@@ -374,8 +762,38 @@ static const struct nft_expr_ops nft_exthdr_tcp_set_ops = {
.eval = nft_exthdr_tcp_set_eval,
.init = nft_exthdr_tcp_set_init,
.dump = nft_exthdr_dump_set,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_tcp_strip_eval,
+ .init = nft_exthdr_tcp_strip_init,
+ .dump = nft_exthdr_dump_strip,
+ .reduce = NFT_REDUCE_READONLY,
};
+static const struct nft_expr_ops nft_exthdr_sctp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_sctp_eval,
+ .init = nft_exthdr_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+static const struct nft_expr_ops nft_exthdr_dccp_ops = {
+ .type = &nft_exthdr_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
+ .eval = nft_exthdr_dccp_eval,
+ .init = nft_exthdr_dccp_init,
+ .dump = nft_exthdr_dump,
+ .reduce = nft_exthdr_reduce,
+};
+#endif
+
static const struct nft_expr_ops *
nft_exthdr_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -395,11 +813,27 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
return &nft_exthdr_tcp_set_ops;
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_tcp_ops;
- break;
+ return &nft_exthdr_tcp_strip_ops;
case NFT_EXTHDR_OP_IPV6:
if (tb[NFTA_EXTHDR_DREG])
return &nft_exthdr_ipv6_ops;
break;
+ case NFT_EXTHDR_OP_IPV4:
+ if (ctx->family != NFPROTO_IPV6) {
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_ipv4_ops;
+ }
+ break;
+ case NFT_EXTHDR_OP_SCTP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_sctp_ops;
+ break;
+#ifdef CONFIG_NFT_EXTHDR_DCCP
+ case NFT_EXTHDR_OP_DCCP:
+ if (tb[NFTA_EXTHDR_DREG])
+ return &nft_exthdr_dccp_ops;
+ break;
+#endif
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 21df8cccea65..96e02a83c045 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -1,7 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*
* Generic part shared by ipv4 and ipv6 backends.
*/
@@ -16,27 +14,29 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
+#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
+ NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
+ NFTA_FIB_F_PRESENT)
+
const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
[NFTA_FIB_DREG] = { .type = NLA_U32 },
[NFTA_FIB_RESULT] = { .type = NLA_U32 },
- [NFTA_FIB_FLAGS] = { .type = NLA_U32 },
+ [NFTA_FIB_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFTA_FIB_F_ALL),
};
EXPORT_SYMBOL(nft_fib_policy);
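With NLA_POLICY_MASK(), unknown flag bits are rejected during netlink attribute validation, before ->init() ever runs, which is why the open-coded "& ~NFTA_FIB_F_ALL" test disappears from nft_fib_init() below. Roughly, the policy entry enforces this check (sketch):

static int check_fib_flags(const struct nlattr *nla)
{
	u32 flags = ntohl(nla_get_be32(nla));

	return (flags & ~NFTA_FIB_F_ALL) ? -EINVAL : 0;
}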
-#define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \
- NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \
- NFTA_FIB_F_PRESENT)
-
-int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_fib *priv = nft_expr_priv(expr);
unsigned int hooks;
switch (priv->result) {
- case NFT_FIB_RESULT_OIF: /* fallthrough */
+ case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
- hooks = (1 << NF_INET_PRE_ROUTING);
+ hooks = (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
break;
case NFT_FIB_RESULT_ADDRTYPE:
if (priv->flags & NFTA_FIB_F_IIF)
@@ -75,7 +75,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->flags = ntohl(nla_get_be32(tb[NFTA_FIB_FLAGS]));
- if (priv->flags == 0 || (priv->flags & ~NFTA_FIB_F_ALL))
+ if (priv->flags == 0)
return -EINVAL;
if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) ==
@@ -88,7 +88,6 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
- priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
@@ -108,8 +107,8 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
return -EINVAL;
}
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ err = nft_parse_register_store(ctx, tb[NFTA_FIB_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
if (err < 0)
return err;
@@ -117,7 +116,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
}
EXPORT_SYMBOL_GPL(nft_fib_init);
-int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset)
{
const struct nft_fib *priv = nft_expr_priv(expr);
@@ -135,21 +134,25 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
EXPORT_SYMBOL_GPL(nft_fib_dump);
void nft_fib_store_result(void *reg, const struct nft_fib *priv,
- const struct nft_pktinfo *pkt, int index)
+ const struct net_device *dev)
{
- struct net_device *dev;
u32 *dreg = reg;
+ int index;
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
- *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index;
+ index = dev ? dev->ifindex : 0;
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ nft_reg_store8(dreg, !!index);
+ else
+ *dreg = index;
+
break;
case NFT_FIB_RESULT_OIFNAME:
- dev = dev_get_by_index_rcu(nft_net(pkt), index);
if (priv->flags & NFTA_FIB_F_PRESENT)
- *dreg = !!dev;
+ nft_reg_store8(dreg, !!dev);
else
- strncpy(reg, dev ? dev->name : "", IFNAMSIZ);
+ strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ);
break;
default:
WARN_ON_ONCE(1);
@@ -159,5 +162,48 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
}
EXPORT_SYMBOL_GPL(nft_fib_store_result);
+bool nft_fib_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int len = NFT_REG32_SIZE;
+ const struct nft_fib *fib;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ break;
+ case NFT_FIB_RESULT_OIFNAME:
+ if (priv->flags & NFTA_FIB_F_PRESENT)
+ len = NFT_REG32_SIZE;
+ else
+ len = IFNAMSIZ;
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ fib = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->result != fib->result ||
+ priv->flags != fib->flags) {
+ nft_reg_track_update(track, expr, priv->dreg, len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(nft_fib_reduce);
+
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Query routing table from nftables");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
diff --git a/net/netfilter/nft_fib_inet.c b/net/netfilter/nft_fib_inet.c
index 9120fc7228f4..666a3741d20b 100644
--- a/net/netfilter/nft_fib_inet.c
+++ b/net/netfilter/nft_fib_inet.c
@@ -1,8 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
@@ -53,6 +49,7 @@ static const struct nft_expr_ops nft_fib_inet_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_inet_type __read_mostly = {
@@ -80,3 +77,4 @@ module_exit(nft_fib_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(1, "fib");
+MODULE_DESCRIPTION("nftables fib inet support");
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index 3997ee36cfbd..9121ec64e918 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2017 Pablo M. Bermudo Garay <pablombg@gmail.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* This code is based on net/netfilter/nft_fib_inet.c, written by
* Florian Westphal <fw@strlen.de>.
*/
@@ -17,6 +14,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/ipv6.h>
#include <net/netfilter/nft_fib.h>
@@ -37,6 +35,8 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr,
}
break;
case ETH_P_IPV6:
+ if (!ipv6_mod_enabled())
+ break;
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
@@ -58,6 +58,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
@@ -85,3 +86,4 @@ module_exit(nft_fib_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo M. Bermudo Garay <pablombg@gmail.com>");
MODULE_ALIAS_NFT_AF_EXPR(5, "fib");
+MODULE_DESCRIPTION("nftables netdev fib lookups support");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 6e6b9adf7d38..b8f76c9057fd 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -5,60 +6,44 @@
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
+#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/ip.h> /* for ipv4 options. */
+#include <net/ip.h>
+#include <net/flow.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_conntrack_helper.h>
struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
-static int nft_flow_route(const struct nft_pktinfo *pkt,
- const struct nf_conn *ct,
- struct nf_flow_route *route,
- enum ip_conntrack_dir dir)
+static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
- struct dst_entry *this_dst = skb_dst(pkt->skb);
- struct dst_entry *other_dst = NULL;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
- fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
- break;
- case NFPROTO_IPV6:
- fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
- fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
- break;
- }
+ if (skb_sec_path(skb))
+ return true;
- nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
- if (!other_dst)
- return -ENOENT;
+ if (family == NFPROTO_IPV4) {
+ const struct ip_options *opt;
- route->tuple[dir].dst = this_dst;
- route->tuple[!dir].dst = other_dst;
+ opt = &(IPCB(skb)->opt);
- return 0;
+ if (unlikely(opt->optlen))
+ return true;
+ }
+
+ return false;
}
-static bool nft_flow_offload_skip(struct sk_buff *skb)
+static void flow_offload_ct_tcp(struct nf_conn *ct)
{
- struct ip_options *opt = &(IPCB(skb)->opt);
-
- if (unlikely(opt->optlen))
- return true;
- if (skb_sec_path(skb))
- return true;
-
- return false;
+ /* Conntrack will not see all packets, so disable TCP window validation. */
+ spin_lock_bh(&ct->lock);
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ spin_unlock_bh(&ct->lock);
}
static void nft_flow_offload_eval(const struct nft_expr *expr,
@@ -67,15 +52,15 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
struct nf_flowtable *flowtable = &priv->flowtable->data;
- const struct nf_conn_help *help;
+ struct tcphdr _tcph, *tcph = NULL;
+ struct nf_flow_route route = {};
enum ip_conntrack_info ctinfo;
- struct nf_flow_route route;
struct flow_offload *flow;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
int ret;
- if (nft_flow_offload_skip(pkt->skb))
+ if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
goto out;
ct = nf_ct_get(pkt->skb, &ctinfo);
@@ -84,31 +69,54 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
case IPPROTO_TCP:
+ tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
+ sizeof(_tcph), &_tcph);
+ if (unlikely(!tcph || tcph->fin || tcph->rst ||
+ !nf_conntrack_tcp_established(ct)))
+ goto out;
+ break;
case IPPROTO_UDP:
break;
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+ if (ct->status & IPS_NAT_MASK)
+ goto out;
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+ goto out;
+ break;
+ }
+#endif
default:
goto out;
}
- help = nfct_help(ct);
- if (help)
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
goto out;
- if (ctinfo == IP_CT_NEW ||
- ctinfo == IP_CT_RELATED)
+ if (!nf_ct_is_confirmed(ct))
goto out;
if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
goto out;
dir = CTINFO2DIR(ctinfo);
- if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
goto err_flow_route;
- flow = flow_offload_alloc(ct, &route);
+ flow = flow_offload_alloc(ct);
if (!flow)
goto err_flow_alloc;
+ flow_offload_route_init(flow, &route);
+ if (tcph)
+ flow_offload_ct_tcp(ct);
+
+ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
ret = flow_offload_add(flowtable, flow);
if (ret < 0)
goto err_flow_add;
@@ -118,6 +126,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
err_flow_add:
flow_offload_free(flow);
err_flow_alloc:
+ dst_release(route.tuple[dir].dst);
dst_release(route.tuple[!dir].dst);
err_flow_route:
clear_bit(IPS_OFFLOAD_BIT, &ct->status);
@@ -126,14 +135,23 @@ out:
}
static int nft_flow_offload_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
unsigned int hook_mask = (1 << NF_INET_FORWARD);
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
return nft_chain_validate_hooks(ctx->chain, hook_mask);
}
+static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
+ [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING,
+ .len = NFT_NAME_MAXLEN - 1 },
+};
+
static int nft_flow_offload_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -145,27 +163,44 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx,
if (!tb[NFTA_FLOW_TABLE_NAME])
return -EINVAL;
- flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME],
- genmask);
+ flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
+ tb[NFTA_FLOW_TABLE_NAME], genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
+ if (!nft_use_inc(&flowtable->use))
+ return -EMFILE;
+
priv->flowtable = flowtable;
- flowtable->use++;
return nf_ct_netns_get(ctx->net, ctx->family);
}
-static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
- priv->flowtable->use--;
+ nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
+}
+
+static void nft_flow_offload_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+ nft_use_inc_restore(&priv->flowtable->use);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
nf_ct_netns_put(ctx->net, ctx->family);
}
-static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_flow_offload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_flow_offload *priv = nft_expr_priv(expr);
@@ -184,14 +219,18 @@ static const struct nft_expr_ops nft_flow_offload_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
.eval = nft_flow_offload_eval,
.init = nft_flow_offload_init,
+ .activate = nft_flow_offload_activate,
+ .deactivate = nft_flow_offload_deactivate,
.destroy = nft_flow_offload_destroy,
.validate = nft_flow_offload_validate,
.dump = nft_flow_offload_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
.name = "flow_offload",
.ops = &nft_flow_offload_ops,
+ .policy = nft_flow_offload_policy,
.maxattr = NFTA_FLOW_MAX,
.owner = THIS_MODULE,
};
@@ -245,3 +284,4 @@ module_exit(nft_flow_offload_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
+MODULE_DESCRIPTION("nftables hardware flow offload module");
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index d7694e7255a0..152a9fb4d23a 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -15,12 +12,13 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <net/netfilter/nf_dup_netdev.h>
#include <net/neighbour.h>
#include <net/ip.h>
struct nft_fwd_netdev {
- enum nft_registers sreg_dev:8;
+ u8 sreg_dev;
};
static void nft_fwd_netdev_eval(const struct nft_expr *expr,
@@ -29,6 +27,11 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ struct sk_buff *skb = pkt->skb;
+
+ /* This is used by ifb only. */
+ skb->skb_iif = skb->dev->ifindex;
+ skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS);
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
@@ -37,7 +40,7 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
[NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
[NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 },
- [NFTA_FWD_NFPROTO] = { .type = NLA_U32 },
+ [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
@@ -49,11 +52,12 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
if (tb[NFTA_FWD_SREG_DEV] == NULL)
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
}
-static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_netdev_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
@@ -66,9 +70,24 @@ nla_put_failure:
return -1;
}
+static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_fwd_netdev *priv = nft_expr_priv(expr);
+ int oif = ctx->regs[priv->sreg_dev].data.data[0];
+
+ return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif);
+}
+
+static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr)
+{
+ return true;
+}
+
struct nft_fwd_neigh {
- enum nft_registers sreg_dev:8;
- enum nft_registers sreg_addr:8;
+ u8 sreg_dev;
+ u8 sreg_addr;
u8 nfproto;
};
@@ -127,6 +146,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
return;
skb->dev = dev;
+ skb_clear_tstamp(skb);
neigh_xmit(neigh_table, dev, addr, skb);
out:
regs->verdict.code = verdict;
@@ -145,8 +165,6 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
!tb[NFTA_FWD_NFPROTO])
return -EINVAL;
- priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
- priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]);
priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO]));
switch (priv->nfproto) {
@@ -160,14 +178,17 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- err = nft_validate_register_load(priv->sreg_dev, sizeof(int));
+ err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+ sizeof(int));
if (err < 0)
return err;
- return nft_validate_register_load(priv->sreg_addr, addr_len);
+ return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
+ addr_len);
}
-static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_fwd_neigh_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
@@ -182,6 +203,13 @@ nla_put_failure:
return -1;
}
+static int nft_fwd_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS));
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.type = &nft_fwd_netdev_type,
@@ -189,6 +217,8 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.eval = nft_fwd_neigh_eval,
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
+ .validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -197,6 +227,10 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.eval = nft_fwd_netdev_eval,
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
+ .validate = nft_fwd_validate,
+ .reduce = NFT_REDUCE_READONLY,
+ .offload = nft_fwd_netdev_offload,
+ .offload_action = nft_fwd_netdev_offload_action,
};
static const struct nft_expr_ops *
@@ -235,4 +269,5 @@ module_exit(nft_fwd_netdev_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("nftables netdev packet forwarding support");
MODULE_ALIAS_NFT_AF_EXPR(5, "fwd");
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index c2d237144f74..5d034bbb6913 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/kernel.h>
@@ -18,14 +14,13 @@
#include <linux/jhash.h>
struct nft_jhash {
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
u8 len;
bool autogen_seed:1;
u32 modulus;
u32 seed;
u32 offset;
- struct nft_set *map;
};
static void nft_jhash_eval(const struct nft_expr *expr,
@@ -42,33 +37,10 @@ static void nft_jhash_eval(const struct nft_expr *expr,
regs->data[priv->dreg] = h + priv->offset;
}
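The hash expressions map a 32-bit hash value into [offset, offset + modulus - 1] via reciprocal_scale(), which replaces an expensive modulo with a multiply and shift. A standalone sketch of the scaling step:

#include <stdint.h>

/* reciprocal_scale(): scale a 32-bit value into [0, modulus). */
static uint32_t reciprocal_scale32(uint32_t hash, uint32_t modulus)
{
	return (uint32_t)(((uint64_t)hash * modulus) >> 32);
}

/* e.g. h = reciprocal_scale32(jhash(data, len, seed), modulus) + offset */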
-static void nft_jhash_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_jhash *priv = nft_expr_priv(expr);
- const void *data = &regs->data[priv->sreg];
- const struct nft_set *map = priv->map;
- const struct nft_set_ext *ext;
- u32 result;
- bool found;
-
- result = reciprocal_scale(jhash(data, priv->len, priv->seed),
- priv->modulus) + priv->offset;
-
- found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
- if (!found)
- return;
-
- nft_data_copy(&regs->data[priv->dreg],
- nft_set_ext_data(ext), map->dlen);
-}
-
struct nft_symhash {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
- struct nft_set *map;
};
static void nft_symhash_eval(const struct nft_expr *expr,
@@ -79,44 +51,20 @@ static void nft_symhash_eval(const struct nft_expr *expr,
struct sk_buff *skb = pkt->skb;
u32 h;
- h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus);
+ h = reciprocal_scale(__skb_get_hash_symmetric_net(nft_net(pkt), skb),
+ priv->modulus);
regs->data[priv->dreg] = h + priv->offset;
}
-static void nft_symhash_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_symhash *priv = nft_expr_priv(expr);
- struct sk_buff *skb = pkt->skb;
- const struct nft_set *map = priv->map;
- const struct nft_set_ext *ext;
- u32 result;
- bool found;
-
- result = reciprocal_scale(__skb_get_hash_symmetric(skb),
- priv->modulus) + priv->offset;
-
- found = map->ops->lookup(nft_net(pkt), map, &result, &ext);
- if (!found)
- return;
-
- nft_data_copy(&regs->data[priv->dreg],
- nft_set_ext_data(ext), map->dlen);
-}
-
static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = {
[NFTA_HASH_SREG] = { .type = NLA_U32 },
[NFTA_HASH_DREG] = { .type = NLA_U32 },
- [NFTA_HASH_LEN] = { .type = NLA_U32 },
+ [NFTA_HASH_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_HASH_MODULUS] = { .type = NLA_U32 },
[NFTA_HASH_SEED] = { .type = NLA_U32 },
[NFTA_HASH_OFFSET] = { .type = NLA_U32 },
[NFTA_HASH_TYPE] = { .type = NLA_U32 },
- [NFTA_HASH_SET_NAME] = { .type = NLA_STRING,
- .len = NFT_SET_MAXNAMELEN - 1 },
- [NFTA_HASH_SET_ID] = { .type = NLA_U32 },
};
static int nft_jhash_init(const struct nft_ctx *ctx,
@@ -136,9 +84,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
if (err < 0)
return err;
@@ -147,6 +92,10 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len);
+ if (err < 0)
+ return err;
+
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
if (priv->modulus < 1)
return -ERANGE;
@@ -161,23 +110,8 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
get_random_bytes(&priv->seed, sizeof(priv->seed));
}
- return nft_validate_register_load(priv->sreg, len) &&
- nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
-}
-
-static int nft_jhash_map_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_jhash *priv = nft_expr_priv(expr);
- u8 genmask = nft_genmask_next(ctx->net);
-
- nft_jhash_init(ctx, expr, tb);
- priv->map = nft_set_lookup_global(ctx->net, ctx->table,
- tb[NFTA_HASH_SET_NAME],
- tb[NFTA_HASH_SET_ID], genmask);
- return PTR_ERR_OR_ZERO(priv->map);
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
static int nft_symhash_init(const struct nft_ctx *ctx,
@@ -193,35 +127,20 @@ static int nft_symhash_init(const struct nft_ctx *ctx,
if (tb[NFTA_HASH_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET]));
- priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
-
priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS]));
- if (priv->modulus <= 1)
+ if (priv->modulus < 1)
return -ERANGE;
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
-}
-
-static int nft_symhash_map_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_jhash *priv = nft_expr_priv(expr);
- u8 genmask = nft_genmask_next(ctx->net);
-
- nft_symhash_init(ctx, expr, tb);
- priv->map = nft_set_lookup_global(ctx->net, ctx->table,
- tb[NFTA_HASH_SET_NAME],
- tb[NFTA_HASH_SET_ID], genmask);
- return PTR_ERR_OR_ZERO(priv->map);
+ return nft_parse_register_store(ctx, tb[NFTA_HASH_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ sizeof(u32));
}
static int nft_jhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_jhash *priv = nft_expr_priv(expr);
@@ -247,20 +166,18 @@ nla_put_failure:
return -1;
}
-static int nft_jhash_map_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static bool nft_jhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
{
const struct nft_jhash *priv = nft_expr_priv(expr);
- if (nft_jhash_dump(skb, expr) ||
- nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
- return -1;
+ nft_reg_track_cancel(track, priv->dreg, sizeof(u32));
- return 0;
+ return false;
}
static int nft_symhash_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_symhash *priv = nft_expr_priv(expr);
@@ -279,16 +196,28 @@ nla_put_failure:
return -1;
}
-static int nft_symhash_map_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+static bool nft_symhash_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
{
- const struct nft_symhash *priv = nft_expr_priv(expr);
+ struct nft_symhash *priv = nft_expr_priv(expr);
+ struct nft_symhash *symhash;
- if (nft_symhash_dump(skb, expr) ||
- nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name))
- return -1;
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
- return 0;
+ symhash = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->offset != symhash->offset ||
+ priv->modulus != symhash->modulus) {
+ nft_reg_track_update(track, expr, priv->dreg, sizeof(u32));
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
}
static struct nft_expr_type nft_hash_type;
@@ -298,14 +227,7 @@ static const struct nft_expr_ops nft_jhash_ops = {
.eval = nft_jhash_eval,
.init = nft_jhash_init,
.dump = nft_jhash_dump,
-};
-
-static const struct nft_expr_ops nft_jhash_map_ops = {
- .type = &nft_hash_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)),
- .eval = nft_jhash_map_eval,
- .init = nft_jhash_map_init,
- .dump = nft_jhash_map_dump,
+ .reduce = nft_jhash_reduce,
};
static const struct nft_expr_ops nft_symhash_ops = {
@@ -314,14 +236,7 @@ static const struct nft_expr_ops nft_symhash_ops = {
.eval = nft_symhash_eval,
.init = nft_symhash_init,
.dump = nft_symhash_dump,
-};
-
-static const struct nft_expr_ops nft_symhash_map_ops = {
- .type = &nft_hash_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)),
- .eval = nft_symhash_map_eval,
- .init = nft_symhash_map_init,
- .dump = nft_symhash_map_dump,
+ .reduce = nft_symhash_reduce,
};
static const struct nft_expr_ops *
@@ -336,12 +251,8 @@ nft_hash_select_ops(const struct nft_ctx *ctx,
type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE]));
switch (type) {
case NFT_HASH_SYM:
- if (tb[NFTA_HASH_SET_NAME])
- return &nft_symhash_map_ops;
return &nft_symhash_ops;
case NFT_HASH_JENKINS:
- if (tb[NFTA_HASH_SET_NAME])
- return &nft_jhash_map_ops;
return &nft_jhash_ops;
default:
break;
@@ -373,3 +284,4 @@ module_exit(nft_hash_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("hash");
+MODULE_DESCRIPTION("Netfilter nftables hash module");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 0777a93211e2..02ee5fb69871 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -16,10 +13,11 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
-static void nft_immediate_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_immediate_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
@@ -31,31 +29,62 @@ static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = {
[NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED },
};
+static enum nft_data_types nft_reg_to_type(const struct nlattr *nla)
+{
+ enum nft_data_types type;
+ u8 reg;
+
+ reg = ntohl(nla_get_be32(nla));
+ if (reg == NFT_REG_VERDICT)
+ type = NFT_DATA_VERDICT;
+ else
+ type = NFT_DATA_VALUE;
+
+ return type;
+}
+
static int nft_immediate_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
struct nft_immediate_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc;
+ struct nft_data_desc desc = {
+ .size = sizeof(priv->data),
+ };
int err;
if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
tb[NFTA_IMMEDIATE_DATA] == NULL)
return -EINVAL;
- err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
- tb[NFTA_IMMEDIATE_DATA]);
+ desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]);
+ err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]);
if (err < 0)
return err;
priv->dlen = desc.len;
- priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, &priv->data,
- desc.type, desc.len);
+ err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG],
+ &priv->dreg, &priv->data, desc.type,
+ desc.len);
if (err < 0)
goto err1;
+ if (priv->dreg == NFT_REG_VERDICT) {
+ struct nft_chain *chain = priv->data.verdict.chain;
+
+ switch (priv->data.verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ err = nf_tables_bind_chain(ctx, chain);
+ if (err < 0)
+ goto err1;
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
err1:
@@ -67,19 +96,140 @@ static void nft_immediate_activate(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+ struct nft_rule *rule;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_activate(&chain_ctx, rule);
+
+ nft_clear(ctx->net, chain);
+ break;
+ default:
+ break;
+ }
+ }
return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg));
}
+static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx,
+ struct nft_chain *chain,
+ enum nft_trans_phase phase)
+{
+ struct nft_ctx chain_ctx;
+ struct nft_rule *rule;
+
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ list_for_each_entry(rule, &chain->rules, list)
+ nft_rule_expr_deactivate(&chain_ctx, rule, phase);
+}
+
static void nft_immediate_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_chain *chain;
+
+ if (priv->dreg == NFT_REG_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+ if (!nft_chain_binding(chain))
+ break;
+
+ switch (phase) {
+ case NFT_TRANS_PREPARE_ERROR:
+ nf_tables_unbind_chain(ctx, chain);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ case NFT_TRANS_PREPARE:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_deactivate_next(ctx->net, chain);
+ break;
+ default:
+ nft_immediate_chain_deactivate(ctx, chain, phase);
+ nft_chain_del(chain);
+ chain->bound = false;
+ nft_use_dec(&chain->table->use);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (phase == NFT_TRANS_COMMIT)
+ return;
return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
-static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void nft_immediate_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ const struct nft_data *data = &priv->data;
+ struct nft_rule *rule, *n;
+ struct nft_ctx chain_ctx;
+ struct nft_chain *chain;
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ return;
+
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ chain = data->verdict.chain;
+
+ if (!nft_chain_binding(chain))
+ break;
+
+ /* Rule construction failed, but chain is already bound:
+ * let the transaction records release this chain and its rules.
+ */
+ if (chain->bound) {
+ nft_use_dec(&chain->use);
+ break;
+ }
+
+ /* Rule has been deleted, release chain and its rules. */
+ chain_ctx = *ctx;
+ chain_ctx.chain = chain;
+
+ nft_use_dec(&chain->use);
+ list_for_each_entry_safe(rule, n, &chain->rules, list) {
+ nft_use_dec(&chain->use);
+ list_del(&rule->list);
+ nf_tables_rule_destroy(&chain_ctx, rule);
+ }
+ nf_tables_chain_destroy(chain);
+ break;
+ default:
+ break;
+ }
+}
+
+static int nft_immediate_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
@@ -94,8 +244,7 @@ nla_put_failure:
}
static int nft_immediate_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
struct nft_ctx *pctx = (struct nft_ctx *)ctx;
@@ -123,6 +272,65 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
return 0;
}
+static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_immediate_expr *priv)
+{
+ struct flow_action_entry *entry;
+ const struct nft_data *data;
+
+ entry = &flow->rule->action.entries[ctx->num_actions++];
+
+ data = &priv->data;
+ switch (data->verdict.code) {
+ case NF_ACCEPT:
+ entry->id = FLOW_ACTION_ACCEPT;
+ break;
+ case NF_DROP:
+ entry->id = FLOW_ACTION_DROP;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_immediate_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ return nft_immediate_offload_verdict(ctx, flow, priv);
+
+ memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data));
+
+ return 0;
+}
+
+static bool nft_immediate_offload_action(const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg == NFT_REG_VERDICT)
+ return true;
+
+ return false;
+}
+
+static bool nft_immediate_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+
+ if (priv->dreg != NFT_REG_VERDICT)
+ nft_reg_track_cancel(track, priv->dreg, priv->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_imm_ops = {
.type = &nft_imm_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)),
@@ -130,8 +338,12 @@ static const struct nft_expr_ops nft_imm_ops = {
.init = nft_immediate_init,
.activate = nft_immediate_activate,
.deactivate = nft_immediate_deactivate,
+ .destroy = nft_immediate_destroy,
.dump = nft_immediate_dump,
.validate = nft_immediate_validate,
+ .reduce = nft_immediate_reduce,
+ .offload = nft_immediate_offload,
+ .offload_action = nft_immediate_offload_action,
};
struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
new file mode 100644
index 000000000000..c4569d4b9228
--- /dev/null
+++ b/net/netfilter/nft_inner.c
@@ -0,0 +1,431 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/gre.h>
+#include <net/geneve.h>
+#include <net/ip.h>
+#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+struct nft_inner_tun_ctx_locked {
+ struct nft_inner_tun_ctx ctx;
+ local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+/* Same layout as nft_expr, but it embeds the private expression data area. */
+struct __nft_expr {
+ const struct nft_expr_ops *ops;
+ union {
+ struct nft_payload payload;
+ struct nft_meta meta;
+ } __attribute__((aligned(__alignof__(u64))));
+};
+
+enum {
+ NFT_INNER_EXPR_PAYLOAD,
+ NFT_INNER_EXPR_META,
+};
+
+struct nft_inner {
+ u8 flags;
+ u8 hdrsize;
+ u8 type;
+ u8 expr_type;
+
+ struct __nft_expr expr;
+};
+
+static int nft_inner_parse_l2l3(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 off)
+{
+ __be16 llproto, outer_llproto;
+ u32 nhoff, thoff;
+
+ if (priv->flags & NFT_INNER_LL) {
+ struct vlan_ethhdr *veth, _veth;
+ struct ethhdr *eth, _eth;
+ u32 hdrsize;
+
+ eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth);
+ if (!eth)
+ return -1;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ llproto = eth->h_proto;
+ hdrsize = sizeof(_eth);
+ break;
+ case htons(ETH_P_8021Q):
+ veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth);
+ if (!veth)
+ return -1;
+
+ outer_llproto = veth->h_vlan_encapsulated_proto;
+ llproto = veth->h_vlan_proto;
+ hdrsize = sizeof(_veth);
+ break;
+ default:
+ return -1;
+ }
+
+ ctx->inner_lloff = off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL;
+ off += hdrsize;
+ } else {
+ struct iphdr *iph;
+ u32 _version;
+
+ iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version);
+ if (!iph)
+ return -1;
+
+ switch (iph->version) {
+ case 4:
+ llproto = htons(ETH_P_IP);
+ break;
+ case 6:
+ llproto = htons(ETH_P_IPV6);
+ break;
+ default:
+ return -1;
+ }
+ }
+
+ ctx->llproto = llproto;
+ if (llproto == htons(ETH_P_8021Q))
+ llproto = outer_llproto;
+
+ nhoff = off;
+
+ switch (llproto) {
+ case htons(ETH_P_IP): {
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ if (iph->ihl < 5 || iph->version != 4)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff + (iph->ihl * 4);
+ if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) {
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = iph->protocol;
+ }
+ }
+ break;
+ case htons(ETH_P_IPV6): {
+ struct ipv6hdr *ip6h, _ip6h;
+ int fh_flags = IP6_FH_F_AUTH;
+ unsigned short fragoff;
+ int l4proto;
+
+ ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h);
+ if (!ip6h)
+ return -1;
+
+ if (ip6h->version != 6)
+ return -1;
+
+ ctx->inner_nhoff = nhoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH;
+
+ thoff = nhoff;
+ l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags);
+ if (l4proto < 0 || thoff > U16_MAX)
+ return -1;
+
+ if (fragoff == 0) {
+ thoff = nhoff + sizeof(_ip6h);
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ ctx->inner_thoff = thoff;
+ ctx->l4proto = l4proto;
+ }
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse_tunhdr(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *ctx, u32 *off)
+{
+ if (pkt->tprot == IPPROTO_GRE) {
+ ctx->inner_tunoff = pkt->thoff;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ return 0;
+ }
+
+ if (pkt->tprot != IPPROTO_UDP)
+ return -1;
+
+ ctx->inner_tunoff = *off;
+ ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN;
+ *off += priv->hdrsize;
+
+ switch (priv->type) {
+ case NFT_INNER_GENEVE: {
+ struct genevehdr *gnvh, _gnvh;
+
+ gnvh = skb_header_pointer(pkt->skb, pkt->inneroff,
+ sizeof(_gnvh), &_gnvh);
+ if (!gnvh)
+ return -1;
+
+ *off += gnvh->opt_len * 4;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_inner_parse(const struct nft_inner *priv,
+ struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ u32 off = pkt->inneroff;
+
+ if (priv->flags & NFT_INNER_HDRSIZE &&
+ nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0)
+ return -1;
+
+ if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) {
+ if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0)
+ return -1;
+ } else if (priv->flags & NFT_INNER_TH) {
+ tun_ctx->inner_thoff = off;
+ tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH;
+ }
+
+ tun_ctx->type = priv->type;
+ tun_ctx->cookie = (unsigned long)pkt->skb;
+ pkt->flags |= NFT_PKTINFO_INNER_FULL;
+
+ return 0;
+}
+
+static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+ return false;
+ }
+ *tun_ctx = *this_cpu_tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+
+ return true;
+}
+
+static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
+ const struct nft_inner_tun_ctx *tun_ctx)
+{
+ struct nft_inner_tun_ctx *this_cpu_tun_ctx;
+
+ local_bh_disable();
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
+ if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
+ *this_cpu_tun_ctx = *tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ local_bh_enable();
+}
+
+static bool nft_inner_parse_needed(const struct nft_inner *priv,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER_FULL))
+ return true;
+
+ if (!nft_inner_restore_tun_ctx(pkt, tun_ctx))
+ return true;
+
+ if (priv->type != tun_ctx->type)
+ return true;
+
+ return false;
+}
+
+static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_inner *priv = nft_expr_priv(expr);
+ struct nft_inner_tun_ctx tun_ctx = {};
+
+ if (nft_payload_inner_offset(pkt) < 0)
+ goto err;
+
+ if (nft_inner_parse_needed(priv, pkt, &tun_ctx) &&
+ nft_inner_parse(priv, (struct nft_pktinfo *)pkt, &tun_ctx) < 0)
+ goto err;
+
+ switch (priv->expr_type) {
+ case NFT_INNER_EXPR_PAYLOAD:
+ nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
+ break;
+ case NFT_INNER_EXPR_META:
+ nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ nft_inner_save_tun_ctx(pkt, &tun_ctx);
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = {
+ [NFTA_INNER_NUM] = { .type = NLA_U32 },
+ [NFTA_INNER_FLAGS] = { .type = NLA_U32 },
+ [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 },
+ [NFTA_INNER_TYPE] = { .type = NLA_U32 },
+ [NFTA_INNER_EXPR] = { .type = NLA_NESTED },
+};
+
+struct nft_expr_info {
+ const struct nft_expr_ops *ops;
+ const struct nlattr *attr;
+ struct nlattr *tb[NFT_EXPR_MAXATTR + 1];
+};
+
+static int nft_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_inner *priv = nft_expr_priv(expr);
+ u32 flags, hdrsize, type, num;
+ struct nft_expr_info expr_info;
+ int err;
+
+ if (!tb[NFTA_INNER_FLAGS] ||
+ !tb[NFTA_INNER_NUM] ||
+ !tb[NFTA_INNER_HDRSIZE] ||
+ !tb[NFTA_INNER_TYPE] ||
+ !tb[NFTA_INNER_EXPR])
+ return -EINVAL;
+
+ flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS]));
+ if (flags & ~NFT_INNER_MASK)
+ return -EOPNOTSUPP;
+
+ num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM]));
+ if (num != 0)
+ return -EOPNOTSUPP;
+
+ hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE]));
+ type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE]));
+
+ if (type > U8_MAX)
+ return -EINVAL;
+
+ if (flags & NFT_INNER_HDRSIZE) {
+ if (hdrsize == 0 || hdrsize > 64)
+ return -EOPNOTSUPP;
+ }
+
+ priv->flags = flags;
+ priv->hdrsize = hdrsize;
+ priv->type = type;
+
+ err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info);
+ if (err < 0)
+ return err;
+
+ priv->expr.ops = expr_info.ops;
+
+ if (!strcmp(expr_info.ops->type->name, "payload"))
+ priv->expr_type = NFT_INNER_EXPR_PAYLOAD;
+ else if (!strcmp(expr_info.ops->type->name, "meta"))
+ priv->expr_type = NFT_INNER_EXPR_META;
+ else
+ return -EINVAL;
+
+ err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr,
+ (const struct nlattr * const*)expr_info.tb);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_inner_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ const struct nft_inner *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) ||
+ nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) ||
+ nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) ||
+ nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize)))
+ goto nla_put_failure;
+
+ if (nft_expr_dump(skb, NFTA_INNER_EXPR,
+ (struct nft_expr *)&priv->expr, reset) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_expr_ops nft_inner_ops = {
+ .type = &nft_inner_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)),
+ .eval = nft_inner_eval,
+ .init = nft_inner_init,
+ .dump = nft_inner_dump,
+};
+
+struct nft_expr_type nft_inner_type __read_mostly = {
+ .name = "inner",
+ .ops = &nft_inner_ops,
+ .policy = nft_inner_policy,
+ .maxattr = NFTA_INNER_MAX,
+ .owner = THIS_MODULE,
+};
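The inner expression above parses the encapsulated packet once and caches the resulting offsets in a per-cpu context keyed by the skb pointer, so several inner matches in one rule share a single parse. A minimal userspace sketch of that cookie-keyed memoization idea (hypothetical names, not kernel code):

#include <stdbool.h>

/* Sketch only: one cache slot stands in for the kernel's per-CPU
 * nft_pcpu_tun_ctx; the cookie identifies the packet.
 */
struct tun_cache {
	unsigned long cookie;
	unsigned int thoff;	/* cached inner transport offset */
};

static struct tun_cache cache;

static bool cache_lookup(unsigned long cookie, unsigned int *thoff)
{
	if (cache.cookie != cookie)
		return false;	/* different packet: caller must reparse */
	*thoff = cache.thoff;
	return true;
}

static void cache_store(unsigned long cookie, unsigned int thoff)
{
	cache.cookie = cookie;
	cache.thoff = thoff;
}

Using the packet pointer as the cookie makes invalidation free: a different packet simply fails the compare and triggers a fresh parse.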
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
new file mode 100644
index 000000000000..de1b6066bfa8
--- /dev/null
+++ b/net/netfilter/nft_last.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+struct nft_last {
+ unsigned long jiffies;
+ unsigned int set;
+};
+
+struct nft_last_priv {
+ struct nft_last *last;
+};
+
+static const struct nla_policy nft_last_policy[NFTA_LAST_MAX + 1] = {
+ [NFTA_LAST_SET] = { .type = NLA_U32 },
+ [NFTA_LAST_MSECS] = { .type = NLA_U64 },
+};
+
+static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last;
+ u64 last_jiffies;
+ int err;
+
+ last = kzalloc(sizeof(*last), GFP_KERNEL_ACCOUNT);
+ if (!last)
+ return -ENOMEM;
+
+ if (tb[NFTA_LAST_SET])
+ last->set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+
+ if (last->set && tb[NFTA_LAST_MSECS]) {
+ err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
+ if (err < 0)
+ goto err;
+
+ last->jiffies = jiffies - (unsigned long)last_jiffies;
+ }
+ priv->last = last;
+
+ return 0;
+err:
+ kfree(last);
+
+ return err;
+}
+
+static void nft_last_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+
+ if (READ_ONCE(last->jiffies) != jiffies)
+ WRITE_ONCE(last->jiffies, jiffies);
+ if (READ_ONCE(last->set) == 0)
+ WRITE_ONCE(last->set, 1);
+}
+
+static int nft_last_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+ struct nft_last *last = priv->last;
+ unsigned long last_jiffies = READ_ONCE(last->jiffies);
+ u32 last_set = READ_ONCE(last->set);
+ __be64 msecs;
+
+ if (time_before(jiffies, last_jiffies)) {
+ WRITE_ONCE(last->set, 0);
+ last_set = 0;
+ }
+
+ if (last_set)
+ msecs = nf_jiffies64_to_msecs(jiffies - last_jiffies);
+ else
+ msecs = 0;
+
+ if (nla_put_be32(skb, NFTA_LAST_SET, htonl(last_set)) ||
+ nla_put_be64(skb, NFTA_LAST_MSECS, msecs, NFTA_LAST_PAD))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_last_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_last_priv *priv = nft_expr_priv(expr);
+
+ kfree(priv->last);
+}
+
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
+{
+ struct nft_last_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_last_priv *priv_src = nft_expr_priv(src);
+
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp);
+ if (!priv_dst->last)
+ return -ENOMEM;
+
+ priv_dst->last->set = priv_src->last->set;
+ priv_dst->last->jiffies = priv_src->last->jiffies;
+
+ return 0;
+}
+
+static const struct nft_expr_ops nft_last_ops = {
+ .type = &nft_last_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_last_priv)),
+ .eval = nft_last_eval,
+ .init = nft_last_init,
+ .destroy = nft_last_destroy,
+ .clone = nft_last_clone,
+ .dump = nft_last_dump,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+struct nft_expr_type nft_last_type __read_mostly = {
+ .name = "last",
+ .ops = &nft_last_ops,
+ .policy = nft_last_policy,
+ .maxattr = NFTA_LAST_MAX,
+ .flags = NFT_EXPR_STATEFUL,
+ .owner = THIS_MODULE,
+};
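nft_last_eval() runs locklessly on the packet path, so it uses READ_ONCE/WRITE_ONCE on word-sized fields and only stores when the value actually changed, keeping the shared cache line clean while jiffies is unchanged. A rough C11 sketch of the same idiom, with plain atomics standing in for the kernel macros:

#include <stdatomic.h>

static _Atomic unsigned long last_stamp;

/* Only dirty the shared cache line when the timestamp moves. */
static void touch(unsigned long now)
{
	if (atomic_load_explicit(&last_stamp, memory_order_relaxed) != now)
		atomic_store_explicit(&last_stamp, now, memory_order_relaxed);
}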
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 72f13a1144dd..21d26b79b460 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -21,6 +18,10 @@ struct nft_limit {
spinlock_t lock;
u64 last;
u64 tokens;
+};
+
+struct nft_limit_priv {
+ struct nft_limit *limit;
u64 tokens_max;
u64 rate;
u64 nsecs;
@@ -28,93 +29,111 @@ struct nft_limit {
bool invert;
};
-static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
+static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
{
u64 now, tokens;
s64 delta;
- spin_lock_bh(&limit->lock);
+ spin_lock_bh(&priv->limit->lock);
now = ktime_get_ns();
- tokens = limit->tokens + now - limit->last;
- if (tokens > limit->tokens_max)
- tokens = limit->tokens_max;
+ tokens = priv->limit->tokens + now - priv->limit->last;
+ if (tokens > priv->tokens_max)
+ tokens = priv->tokens_max;
- limit->last = now;
+ priv->limit->last = now;
delta = tokens - cost;
if (delta >= 0) {
- limit->tokens = delta;
- spin_unlock_bh(&limit->lock);
- return limit->invert;
+ priv->limit->tokens = delta;
+ spin_unlock_bh(&priv->limit->lock);
+ return priv->invert;
}
- limit->tokens = tokens;
- spin_unlock_bh(&limit->lock);
- return !limit->invert;
+ priv->limit->tokens = tokens;
+ spin_unlock_bh(&priv->limit->lock);
+ return !priv->invert;
}
/* Use same default as in iptables. */
#define NFT_LIMIT_PKT_BURST_DEFAULT 5
-static int nft_limit_init(struct nft_limit *limit,
+static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
- u64 unit, tokens;
+ u64 unit, tokens, rate_with_burst;
+ bool invert = false;
if (tb[NFTA_LIMIT_RATE] == NULL ||
tb[NFTA_LIMIT_UNIT] == NULL)
return -EINVAL;
- limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
+ if (priv->rate == 0)
+ return -EINVAL;
+
unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
- limit->nsecs = unit * NSEC_PER_SEC;
- if (limit->rate == 0 || limit->nsecs < unit)
+ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
return -EOVERFLOW;
if (tb[NFTA_LIMIT_BURST])
- limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
+ priv->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST]));
- if (pkts && limit->burst == 0)
- limit->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
- if (limit->rate + limit->burst < limit->rate)
+ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
return -EOVERFLOW;
if (pkts) {
- tokens = div_u64(limit->nsecs, limit->rate) * limit->burst;
+ u64 tmp = div64_u64(priv->nsecs, priv->rate);
+
+ if (check_mul_overflow(tmp, priv->burst, &tokens))
+ return -EOVERFLOW;
} else {
+ u64 tmp;
+
/* The token bucket size limits the number of tokens that can be
* accumulated. tokens_max specifies the bucket size.
* tokens_max = unit * (rate + burst) / rate.
*/
- tokens = div_u64(limit->nsecs * (limit->rate + limit->burst),
- limit->rate);
- }
+ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
+ return -EOVERFLOW;
- limit->tokens = tokens;
- limit->tokens_max = limit->tokens;
+ tokens = div64_u64(tmp, priv->rate);
+ }
if (tb[NFTA_LIMIT_FLAGS]) {
u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
+ if (flags & ~NFT_LIMIT_F_INV)
+ return -EOPNOTSUPP;
+
if (flags & NFT_LIMIT_F_INV)
- limit->invert = true;
+ invert = true;
}
- limit->last = ktime_get_ns();
- spin_lock_init(&limit->lock);
+
+ priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
+ if (!priv->limit)
+ return -ENOMEM;
+
+ priv->limit->tokens = tokens;
+ priv->tokens_max = priv->limit->tokens;
+ priv->invert = invert;
+ priv->limit->last = ktime_get_ns();
+ spin_lock_init(&priv->limit->lock);
return 0;
}
-static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
+static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit_priv *priv,
enum nft_limit_type type)
{
- u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0;
- u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
+ u32 flags = priv->invert ? NFT_LIMIT_F_INV : 0;
+ u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC);
- if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate),
+ if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate),
NFTA_LIMIT_PAD) ||
nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs),
NFTA_LIMIT_PAD) ||
- nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
+ nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(priv->burst)) ||
nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
goto nla_put_failure;
@@ -124,8 +143,34 @@ nla_put_failure:
return -1;
}
-struct nft_limit_pkts {
- struct nft_limit limit;
+static void nft_limit_destroy(const struct nft_ctx *ctx,
+ const struct nft_limit_priv *priv)
+{
+ kfree(priv->limit);
+}
+
+static int nft_limit_clone(struct nft_limit_priv *priv_dst,
+ const struct nft_limit_priv *priv_src, gfp_t gfp)
+{
+ priv_dst->tokens_max = priv_src->tokens_max;
+ priv_dst->rate = priv_src->rate;
+ priv_dst->nsecs = priv_src->nsecs;
+ priv_dst->burst = priv_src->burst;
+ priv_dst->invert = priv_src->invert;
+
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp);
+ if (!priv_dst->limit)
+ return -ENOMEM;
+
+ spin_lock_init(&priv_dst->limit->lock);
+ priv_dst->limit->tokens = priv_src->tokens_max;
+ priv_dst->limit->last = ktime_get_ns();
+
+ return 0;
+}
+
+struct nft_limit_priv_pkts {
+ struct nft_limit_priv limit;
u64 cost;
};
@@ -133,7 +178,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -151,7 +196,7 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -162,27 +207,50 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_limit_pkts_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
- const struct nft_limit_pkts *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
+{
+ struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
+
+ priv_dst->cost = priv_src->cost;
+
+ return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp);
+}
+
static struct nft_expr_type nft_limit_type;
static const struct nft_expr_ops nft_limit_pkts_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.eval = nft_limit_pkts_eval,
.init = nft_limit_pkts_init,
+ .destroy = nft_limit_pkts_destroy,
+ .clone = nft_limit_pkts_clone,
.dump = nft_limit_pkts_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static void nft_limit_bytes_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -193,25 +261,45 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
- struct nft_limit *priv = nft_expr_priv(expr);
+ struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_init(priv, tb, false);
}
static int nft_limit_bytes_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
- const struct nft_limit *priv = nft_expr_priv(expr);
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_limit_priv *priv = nft_expr_priv(expr);
+
+ nft_limit_destroy(ctx, priv);
+}
+
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
+{
+ struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
+ struct nft_limit_priv *priv_src = nft_expr_priv(src);
+
+ return nft_limit_clone(priv_dst, priv_src, gfp);
+}
+
static const struct nft_expr_ops nft_limit_bytes_ops = {
.type = &nft_limit_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv)),
.eval = nft_limit_bytes_eval,
.init = nft_limit_bytes_init,
.dump = nft_limit_bytes_dump,
+ .clone = nft_limit_bytes_clone,
+ .destroy = nft_limit_bytes_destroy,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -243,7 +331,7 @@ static void nft_limit_obj_pkts_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
if (nft_limit_eval(&priv->limit, priv->cost))
regs->verdict.code = NFT_BREAK;
@@ -253,7 +341,7 @@ static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit_pkts *priv = nft_obj_data(obj);
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
int err;
err = nft_limit_init(&priv->limit, tb, true);
@@ -268,16 +356,25 @@ static int nft_limit_obj_pkts_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit_pkts *priv = nft_obj_data(obj);
+ const struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS);
}
+static void nft_limit_obj_pkts_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv_pkts *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, &priv->limit);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_pkts_ops = {
.type = &nft_limit_obj_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)),
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_priv_pkts)),
.init = nft_limit_obj_pkts_init,
+ .destroy = nft_limit_obj_pkts_destroy,
.eval = nft_limit_obj_pkts_eval,
.dump = nft_limit_obj_pkts_dump,
};
@@ -286,7 +383,7 @@ static void nft_limit_obj_bytes_eval(struct nft_object *obj,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate);
if (nft_limit_eval(priv, cost))
@@ -297,7 +394,7 @@ static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
{
- struct nft_limit *priv = nft_obj_data(obj);
+ struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_init(priv, tb, false);
}
@@ -306,16 +403,25 @@ static int nft_limit_obj_bytes_dump(struct sk_buff *skb,
struct nft_object *obj,
bool reset)
{
- const struct nft_limit *priv = nft_obj_data(obj);
+ const struct nft_limit_priv *priv = nft_obj_data(obj);
return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES);
}
+static void nft_limit_obj_bytes_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_limit_priv *priv = nft_obj_data(obj);
+
+ nft_limit_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_limit_obj_type;
static const struct nft_object_ops nft_limit_obj_bytes_ops = {
.type = &nft_limit_obj_type,
- .size = sizeof(struct nft_limit),
+ .size = sizeof(struct nft_limit_priv),
.init = nft_limit_obj_bytes_init,
+ .destroy = nft_limit_obj_bytes_destroy,
.eval = nft_limit_obj_bytes_eval,
.dump = nft_limit_obj_bytes_dump,
};
@@ -375,3 +481,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("limit");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT);
+MODULE_DESCRIPTION("nftables limit expression support");
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 655187bed5d8..e35588137995 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2012-2014 Pablo Neira Ayuso <pablo@netfilter.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -131,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
[NFTA_LOG_FLAGS] = { .type = NLA_U32 },
};
+static int nft_log_modprobe(struct net *net, enum nf_log_type t)
+{
+ switch (t) {
+ case NF_LOG_TYPE_LOG:
+ return nft_request_module(net, "%s", "nf_log_syslog");
+ case NF_LOG_TYPE_ULOG:
+ return nft_request_module(net, "%s", "nfnetlink_log");
+ case NF_LOG_TYPE_MAX:
+ break;
+ }
+
+ return -ENOENT;
+}
+
static int nft_log_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -152,10 +163,10 @@ static int nft_log_init(const struct nft_ctx *ctx,
nla = tb[NFTA_LOG_PREFIX];
if (nla != NULL) {
- priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL);
+ priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT);
if (priv->prefix == NULL)
return -ENOMEM;
- nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1);
+ nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);
} else {
priv->prefix = (char *)nft_log_null_prefix;
}
@@ -200,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx,
return 0;
err = nf_logger_find_get(ctx->family, li->type);
- if (err < 0)
+ if (err < 0) {
+ if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN)
+ err = -EAGAIN;
+
goto err1;
+ }
return 0;
@@ -226,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx,
nf_logger_put(ctx->family, li->type);
}
-static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_log_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_log *priv = nft_expr_priv(expr);
const struct nf_loginfo *li = &priv->loginfo;
@@ -275,6 +291,7 @@ static const struct nft_expr_ops nft_log_ops = {
.init = nft_log_init,
.destroy = nft_log_destroy,
.dump = nft_log_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_log_type __read_mostly = {
@@ -301,3 +318,4 @@ module_exit(nft_log_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_EXPR("log");
+MODULE_DESCRIPTION("Netfilter nf_tables log module");
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 227b2b15a19c..fc2d7c5d83c8 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -20,32 +17,108 @@
struct nft_lookup {
struct nft_set *set;
- enum nft_registers sreg:8;
- enum nft_registers dreg:8;
+ u8 sreg;
+ u8 dreg;
+ bool dreg_set;
bool invert;
struct nft_set_binding binding;
};
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ if (set->ops == &nft_set_hash_fast_type.ops)
+ return nft_hash_lookup_fast(net, set, key);
+ if (set->ops == &nft_set_hash_type.ops)
+ return nft_hash_lookup(net, set, key);
+
+ if (set->ops == &nft_set_rhash_type.ops)
+ return nft_rhash_lookup(net, set, key);
+
+ if (set->ops == &nft_set_bitmap_type.ops)
+ return nft_bitmap_lookup(net, set, key);
+
+ if (set->ops == &nft_set_pipapo_type.ops)
+ return nft_pipapo_lookup(net, set, key);
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (set->ops == &nft_set_pipapo_avx2_type.ops)
+ return nft_pipapo_avx2_lookup(net, set, key);
+#endif
+
+ if (set->ops == &nft_set_rbtree_type.ops)
+ return nft_rbtree_lookup(net, set, key);
+
+ WARN_ON_ONCE(1);
+#endif
+ return set->ops->lookup(net, set, key);
+}
+
+static unsigned int nft_base_seq(const struct net *net)
+{
+ /* pairs with smp_store_release() in nf_tables_commit() */
+ return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+ return unlikely(seq != nft_base_seq(net));
+}
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ const struct nft_set_ext *ext;
+ unsigned int base_seq;
+
+ do {
+ base_seq = nft_base_seq(net);
+
+ ext = __nft_set_do_lookup(net, set, key);
+ if (ext)
+ break;
+ /* No match? There is a small chance that lookup was
+ * performed in the old generation, but nf_tables_commit()
+ * already unlinked a (matching) element.
+ *
+ * We need to repeat the lookup to make sure that we didn't
+ * miss a matching element in the new generation.
+ */
+ } while (nft_lookup_should_retry(net, base_seq));
+
+ return ext;
+}
+EXPORT_SYMBOL_GPL(nft_set_do_lookup);
+
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
+ const struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext) ^ priv->invert;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ found = !!ext ^ priv->invert;
if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
- if (set->flags & NFT_SET_MAP)
- nft_data_copy(&regs->data[priv->dreg],
- nft_set_ext_data(ext), set->dlen);
+ if (ext) {
+ if (priv->dreg_set)
+ nft_data_copy(&regs->data[priv->dreg],
+ nft_set_ext_data(ext), set->dlen);
+ nft_set_elem_update_expr(ext, regs, pkt);
+ }
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
@@ -54,7 +127,8 @@ static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
[NFTA_LOOKUP_SET_ID] = { .type = NLA_U32 },
[NFTA_LOOKUP_SREG] = { .type = NLA_U32 },
[NFTA_LOOKUP_DREG] = { .type = NLA_U32 },
- [NFTA_LOOKUP_FLAGS] = { .type = NLA_U32 },
+ [NFTA_LOOKUP_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NFT_LOOKUP_F_INV),
};
static int nft_lookup_init(const struct nft_ctx *ctx,
@@ -76,25 +150,16 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_EVAL)
- return -EOPNOTSUPP;
-
- priv->sreg = nft_parse_register(tb[NFTA_LOOKUP_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
if (tb[NFTA_LOOKUP_FLAGS]) {
flags = ntohl(nla_get_be32(tb[NFTA_LOOKUP_FLAGS]));
- if (flags & ~NFT_LOOKUP_F_INV)
- return -EINVAL;
-
- if (flags & NFT_LOOKUP_F_INV) {
- if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ if (flags & NFT_LOOKUP_F_INV)
priv->invert = true;
- }
}
if (tb[NFTA_LOOKUP_DREG] != NULL) {
@@ -103,13 +168,23 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_MAP))
return -EINVAL;
- priv->dreg = nft_parse_register(tb[NFTA_LOOKUP_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- set->dtype, set->dlen);
+ err = nft_parse_register_store(ctx, tb[NFTA_LOOKUP_DREG],
+ &priv->dreg, NULL,
+ nft_set_datatype(set),
+ set->dlen);
if (err < 0)
return err;
- } else if (set->flags & NFT_SET_MAP)
- return -EINVAL;
+ priv->dreg_set = true;
+ } else if (set->flags & NFT_SET_MAP) {
+ /* Map given, but user asks for lookup only (i.e. to
+ * ignore value associated with key).
+ *
+ * This makes no sense for anonymous maps since they are
+ * scoped to the rule, but for named sets this can be useful.
+ */
+ if (set->flags & NFT_SET_ANONYMOUS)
+ return -EINVAL;
+ }
priv->binding.flags = set->flags & NFT_SET_MAP;
@@ -121,20 +196,21 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
return 0;
}
-static void nft_lookup_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_lookup_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_lookup *priv = nft_expr_priv(expr);
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+ nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
}
-static void nft_lookup_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_lookup_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_lookup *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_lookup_destroy(const struct nft_ctx *ctx,
@@ -145,7 +221,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_lookup_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0;
@@ -154,7 +231,7 @@ static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_LOOKUP_SREG, priv->sreg))
goto nla_put_failure;
- if (priv->set->flags & NFT_SET_MAP)
+ if (priv->dreg_set)
if (nft_dump_register(skb, NFTA_LOOKUP_DREG, priv->dreg))
goto nla_put_failure;
if (nla_put_be32(skb, NFTA_LOOKUP_FLAGS, htonl(flags)))
@@ -165,61 +242,41 @@ nla_put_failure:
return -1;
}
-static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
- struct nft_set *set,
- const struct nft_set_iter *iter,
- struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
- struct nft_ctx *pctx = (struct nft_ctx *)ctx;
- const struct nft_data *data;
- int err;
-
- if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
- *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
- return 0;
-
- data = nft_set_ext_data(ext);
- switch (data->verdict.code) {
- case NFT_JUMP:
- case NFT_GOTO:
- pctx->level++;
- err = nft_chain_validate(ctx, data->verdict.chain);
- if (err < 0)
- return err;
- pctx->level--;
- break;
- default:
- break;
- }
-
- return 0;
-}
-
static int nft_lookup_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **d)
+ const struct nft_expr *expr)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
- struct nft_set_iter iter;
+ struct nft_set_iter iter = {
+ .genmask = nft_genmask_next(ctx->net),
+ .type = NFT_ITER_UPDATE,
+ .fn = nft_setelem_validate,
+ };
if (!(priv->set->flags & NFT_SET_MAP) ||
priv->set->dtype != NFT_DATA_VERDICT)
return 0;
- iter.genmask = nft_genmask_next(ctx->net);
- iter.skip = 0;
- iter.count = 0;
- iter.err = 0;
- iter.fn = nft_lookup_validate_setelem;
-
priv->set->ops->walk(ctx, priv->set, &iter);
+ if (!iter.err)
+ iter.err = nft_set_catchall_validate(ctx, priv->set);
+
if (iter.err < 0)
return iter.err;
return 0;
}
+static bool nft_lookup_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_lookup *priv = nft_expr_priv(expr);
+
+ if (priv->set->flags & NFT_SET_MAP)
+ nft_reg_track_cancel(track, priv->dreg, priv->set->dlen);
+
+ return false;
+}
+
static const struct nft_expr_ops nft_lookup_ops = {
.type = &nft_lookup_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)),
@@ -230,6 +287,7 @@ static const struct nft_expr_ops nft_lookup_ops = {
.destroy = nft_lookup_destroy,
.dump = nft_lookup_dump,
.validate = nft_lookup_validate,
+ .reduce = nft_lookup_reduce,
};
struct nft_expr_type nft_lookup_type __read_mostly = {
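nft_set_do_lookup() above pairs an acquire load of net->nft.base_seq with the release store in nf_tables_commit(): a reader that misses re-checks the generation and retries, so an element unlinked mid-lookup cannot be missed in the new generation. A C11 userspace sketch of that retry shape (the miss here is simulated):

#include <stdatomic.h>
#include <stddef.h>

static _Atomic unsigned int base_seq;	/* writer bumps with release */

static const void *lookup_once(const void *key)
{
	(void)key;
	return NULL;	/* stand-in: always a miss in this sketch */
}

static const void *lookup_retry(const void *key)
{
	const void *res;
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&base_seq, memory_order_acquire);
		res = lookup_once(key);
		if (res)
			break;
		/* miss: retry only if a commit changed the generation */
	} while (seq != atomic_load_explicit(&base_seq, memory_order_acquire));

	return res;
}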
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9d8655bc1bea..868bd4d73555 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -14,18 +11,23 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nft_masq.h>
+#include <net/netfilter/nf_nat_masquerade.h>
-const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
- [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+struct nft_masq {
+ u32 flags;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
+};
+
+static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
+ [NFTA_MASQ_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
[NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 },
};
-EXPORT_SYMBOL_GPL(nft_masq_policy);
-int nft_masq_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_masq_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
int err;
@@ -36,36 +38,28 @@ int nft_masq_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_POST_ROUTING));
}
-EXPORT_SYMBOL_GPL(nft_masq_validate);
-int nft_masq_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_masq_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
- u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ u32 plen = sizeof_field(struct nf_nat_range, min_proto.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
- if (tb[NFTA_MASQ_FLAGS]) {
+ if (tb[NFTA_MASQ_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
@@ -75,9 +69,9 @@ int nft_masq_init(const struct nft_ctx *ctx,
return nf_ct_netns_get(ctx->net, ctx->family);
}
-EXPORT_SYMBOL_GPL(nft_masq_init);
-int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_masq_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_masq *priv = nft_expr_priv(expr);
@@ -98,7 +92,197 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_masq_dump);
+
+static void nft_masq_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_masq *priv = nft_expr_priv(expr);
+ struct nf_nat_range2 range;
+
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
+ if (priv->sreg_proto_min) {
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+ }
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb,
+ nft_hook(pkt),
+ &range,
+ nft_out(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
+ nft_out(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
+static struct nft_expr_type nft_masq_ipv4_type;
+static const struct nft_expr_ops nft_masq_ipv4_ops = {
+ .type = &nft_masq_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_eval,
+ .init = nft_masq_init,
+ .destroy = nft_masq_ipv4_destroy,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "masq",
+ .ops = &nft_masq_ipv4_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static void
+nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
+static struct nft_expr_type nft_masq_ipv6_type;
+static const struct nft_expr_ops nft_masq_ipv6_ops = {
+ .type = &nft_masq_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_eval,
+ .init = nft_masq_init,
+ .destroy = nft_masq_ipv6_destroy,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "masq",
+ .ops = &nft_masq_ipv6_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_module_init_ipv6(void)
+{
+ return nft_register_expr(&nft_masq_ipv6_type);
+}
+
+static void nft_masq_module_exit_ipv6(void)
+{
+ nft_unregister_expr(&nft_masq_ipv6_type);
+}
+#else
+static inline int nft_masq_module_init_ipv6(void) { return 0; }
+static inline void nft_masq_module_exit_ipv6(void) {}
+#endif
+
+#ifdef CONFIG_NF_TABLES_INET
+static void
+nft_masq_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_INET);
+}
+
+static struct nft_expr_type nft_masq_inet_type;
+static const struct nft_expr_ops nft_masq_inet_ops = {
+ .type = &nft_masq_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
+ .eval = nft_masq_eval,
+ .init = nft_masq_init,
+ .destroy = nft_masq_inet_destroy,
+ .dump = nft_masq_dump,
+ .validate = nft_masq_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_masq_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "masq",
+ .ops = &nft_masq_inet_ops,
+ .policy = nft_masq_policy,
+ .maxattr = NFTA_MASQ_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_masq_module_init_inet(void)
+{
+ return nft_register_expr(&nft_masq_inet_type);
+}
+
+static void nft_masq_module_exit_inet(void)
+{
+ nft_unregister_expr(&nft_masq_inet_type);
+}
+#else
+static inline int nft_masq_module_init_inet(void) { return 0; }
+static inline void nft_masq_module_exit_inet(void) {}
+#endif
+
+static int __init nft_masq_module_init(void)
+{
+ int ret;
+
+ ret = nft_masq_module_init_ipv6();
+ if (ret < 0)
+ return ret;
+
+ ret = nft_masq_module_init_inet();
+ if (ret < 0) {
+ nft_masq_module_exit_ipv6();
+ return ret;
+ }
+
+ ret = nft_register_expr(&nft_masq_ipv4_type);
+ if (ret < 0) {
+ nft_masq_module_exit_inet();
+ nft_masq_module_exit_ipv6();
+ return ret;
+ }
+
+ ret = nf_nat_masquerade_inet_register_notifiers();
+ if (ret < 0) {
+ nft_masq_module_exit_ipv6();
+ nft_masq_module_exit_inet();
+ nft_unregister_expr(&nft_masq_ipv4_type);
+ return ret;
+ }
+
+ return ret;
+}
+
+static void __exit nft_masq_module_exit(void)
+{
+ nft_masq_module_exit_ipv6();
+ nft_masq_module_exit_inet();
+ nft_unregister_expr(&nft_masq_ipv4_type);
+ nf_nat_masquerade_inet_unregister_notifiers();
+}
+
+module_init(nft_masq_module_init);
+module_exit(nft_masq_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
+MODULE_ALIAS_NFT_EXPR("masq");
+MODULE_DESCRIPTION("Netfilter nftables masquerade expression support");
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 6df486c5ebd3..05cd1e6e6a2f 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2014 Intel Corporation
* Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -17,42 +14,305 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/random.h>
#include <linux/smp.h>
#include <linux/static_key.h>
#include <net/dst.h>
+#include <net/ip.h>
#include <net/sock.h>
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nft_meta.h>
+#include <net/netfilter/nf_tables_offload.h>
#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
-struct nft_meta {
- enum nft_meta_keys key:8;
- union {
- enum nft_registers dreg:8;
- enum nft_registers sreg:8;
- };
-};
+#define NFT_META_SECS_PER_MINUTE 60
+#define NFT_META_SECS_PER_HOUR 3600
+#define NFT_META_SECS_PER_DAY 86400
+#define NFT_META_DAYS_PER_WEEK 7
+
+static u8 nft_meta_weekday(void)
+{
+ time64_t secs = ktime_get_real_seconds();
+ unsigned int dse;
+ u8 wday;
+
+ secs -= NFT_META_SECS_PER_MINUTE * sys_tz.tz_minuteswest;
+ dse = div_u64(secs, NFT_META_SECS_PER_DAY);
+ wday = (4 + dse) % NFT_META_DAYS_PER_WEEK;
+
+ return wday;
+}
+
+static u32 nft_meta_hour(time64_t secs)
+{
+ struct tm tm;
+
+ time64_to_tm(secs, 0, &tm);
+
+ return tm.tm_hour * NFT_META_SECS_PER_HOUR
+ + tm.tm_min * NFT_META_SECS_PER_MINUTE
+ + tm.tm_sec;
+}
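+
+/* Worked example (illustrative values, tz_minuteswest = 0):
+ * 2024-01-01 10:30:00 UTC gives tm_hour = 10, tm_min = 30, tm_sec = 0,
+ * so nft_meta_hour() returns 10 * 3600 + 30 * 60 = 37800 seconds since
+ * midnight; the same instant is epoch day 19723, and (4 + 19723) % 7
+ * = 1, i.e. Monday in nft_meta_weekday()'s 0 = Sunday encoding.
+ */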
+
+static noinline_for_stack void
+nft_meta_get_eval_time(enum nft_meta_keys key,
+ u32 *dest)
+{
+ switch (key) {
+ case NFT_META_TIME_NS:
+ nft_reg_store64((u64 *)dest, ktime_get_real_ns());
+ break;
+ case NFT_META_TIME_DAY:
+ nft_reg_store8(dest, nft_meta_weekday());
+ break;
+ case NFT_META_TIME_HOUR:
+ *dest = nft_meta_hour(ktime_get_real_seconds());
+ break;
+ default:
+ break;
+ }
+}
+
+static noinline bool
+nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt,
+ u32 *dest)
+{
+ const struct sk_buff *skb = pkt->skb;
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ else
+ nft_reg_store8(dest, PACKET_BROADCAST);
+ break;
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ case NFPROTO_NETDEV:
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ int noff = skb_network_offset(skb);
+ struct iphdr *iph, _iph;
+
+ iph = skb_header_pointer(skb, noff,
+ sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
-static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+ if (ipv4_is_multicast(iph->daddr))
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ else
+ nft_reg_store8(dest, PACKET_BROADCAST);
-#ifdef CONFIG_NF_TABLES_BRIDGE
-#include "../bridge/br_private.h"
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ nft_reg_store8(dest, PACKET_MULTICAST);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ return true;
+}
+
+static noinline bool
+nft_meta_get_eval_skugid(enum nft_meta_keys key,
+ u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ struct sock *sk = skb_to_full_sk(pkt->skb);
+ struct socket *sock;
+
+ if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+ return false;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ sock = sk->sk_socket;
+ if (!sock || !sock->file) {
+ read_unlock_bh(&sk->sk_callback_lock);
+ return false;
+ }
+
+ switch (key) {
+ case NFT_META_SKUID:
+ *dest = from_kuid_munged(sock_net(sk)->user_ns,
+ sock->file->f_cred->fsuid);
+ break;
+ case NFT_META_SKGID:
+ *dest = from_kgid_munged(sock_net(sk)->user_ns,
+ sock->file->f_cred->fsgid);
+ break;
+ default:
+ break;
+ }
+
+ read_unlock_bh(&sk->sk_callback_lock);
+ return true;
+}
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+static noinline bool
+nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt)
+{
+ struct sock *sk = skb_to_full_sk(pkt->skb);
+
+ if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+ return false;
+
+ *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
+ return true;
+}
#endif
+static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key,
+ u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+
+ switch (key) {
+ case NFT_META_IIFKIND:
+ if (!in || !in->rtnl_link_ops)
+ return false;
+ strscpy_pad((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+ break;
+ case NFT_META_OIFKIND:
+ if (!out || !out->rtnl_link_ops)
+ return false;
+ strscpy_pad((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev)
+{
+ *dest = dev ? dev->ifindex : 0;
+}
+
+static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev)
+{
+ strscpy_pad((char *)dest, dev ? dev->name : "", IFNAMSIZ);
+}
+
+static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev)
+{
+ if (!dev)
+ return false;
+
+ nft_reg_store16(dest, dev->type);
+ return true;
+}
+
+static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev)
+{
+ if (!dev)
+ return false;
+
+ *dest = dev->group;
+ return true;
+}
+
+static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
+ const struct nft_pktinfo *pkt)
+{
+ switch (key) {
+ case NFT_META_IIFNAME:
+ nft_meta_store_ifname(dest, nft_in(pkt));
+ break;
+ case NFT_META_OIFNAME:
+ nft_meta_store_ifname(dest, nft_out(pkt));
+ break;
+ case NFT_META_IIF:
+ nft_meta_store_ifindex(dest, nft_in(pkt));
+ break;
+ case NFT_META_OIF:
+ nft_meta_store_ifindex(dest, nft_out(pkt));
+ break;
+ case NFT_META_IFTYPE:
+ if (!nft_meta_store_iftype(dest, pkt->skb->dev))
+ return false;
+ break;
+ case __NFT_META_IIFTYPE:
+ if (!nft_meta_store_iftype(dest, nft_in(pkt)))
+ return false;
+ break;
+ case NFT_META_OIFTYPE:
+ if (!nft_meta_store_iftype(dest, nft_out(pkt)))
+ return false;
+ break;
+ case NFT_META_IIFGROUP:
+ if (!nft_meta_store_ifgroup(dest, nft_in(pkt)))
+ return false;
+ break;
+ case NFT_META_OIFGROUP:
+ if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static noinline bool
+nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
+{
+ const struct dst_entry *dst = skb_dst(skb);
+
+ if (!dst)
+ return false;
+
+ *dest = dst->tclassid;
+ return true;
+}
+#endif
+
+static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ return inet_sdif(pkt->skb);
+ case NFPROTO_IPV6:
+ return inet6_sdif(pkt->skb);
+ }
+
+ return 0;
+}
+
+static noinline void
+nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt)
+{
+ u32 sdif = nft_meta_get_eval_sdif(pkt);
+ const struct net_device *dev;
+
+ dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL;
+ nft_meta_store_ifname(dest, dev);
+}
+
void nft_meta_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
- const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
- struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
-#ifdef CONFIG_NF_TABLES_BRIDGE
- const struct net_bridge_port *p;
-#endif
switch (priv->key) {
case NFT_META_LEN:
@@ -65,7 +325,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_reg_store8(dest, nft_pf(pkt));
break;
case NFT_META_L4PROTO:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
goto err;
nft_reg_store8(dest, pkt->tprot);
break;
@@ -76,77 +336,26 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*dest = skb->mark;
break;
case NFT_META_IIF:
- if (in == NULL)
- goto err;
- *dest = in->ifindex;
- break;
case NFT_META_OIF:
- if (out == NULL)
- goto err;
- *dest = out->ifindex;
- break;
case NFT_META_IIFNAME:
- if (in == NULL)
- goto err;
- strncpy((char *)dest, in->name, IFNAMSIZ);
- break;
case NFT_META_OIFNAME:
- if (out == NULL)
- goto err;
- strncpy((char *)dest, out->name, IFNAMSIZ);
- break;
case NFT_META_IIFTYPE:
- if (in == NULL)
- goto err;
- nft_reg_store16(dest, in->type);
- break;
case NFT_META_OIFTYPE:
- if (out == NULL)
+ case NFT_META_IIFGROUP:
+ case NFT_META_OIFGROUP:
+ if (!nft_meta_get_eval_ifname(priv->key, dest, pkt))
goto err;
- nft_reg_store16(dest, out->type);
break;
case NFT_META_SKUID:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
- goto err;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket == NULL ||
- sk->sk_socket->file == NULL) {
- read_unlock_bh(&sk->sk_callback_lock);
- goto err;
- }
-
- *dest = from_kuid_munged(&init_user_ns,
- sk->sk_socket->file->f_cred->fsuid);
- read_unlock_bh(&sk->sk_callback_lock);
- break;
case NFT_META_SKGID:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
- goto err;
-
- read_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_socket == NULL ||
- sk->sk_socket->file == NULL) {
- read_unlock_bh(&sk->sk_callback_lock);
+ if (!nft_meta_get_eval_skugid(priv->key, dest, pkt))
goto err;
- }
- *dest = from_kgid_munged(&init_user_ns,
- sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&sk->sk_callback_lock);
break;
#ifdef CONFIG_IP_ROUTE_CLASSID
- case NFT_META_RTCLASSID: {
- const struct dst_entry *dst = skb_dst(skb);
-
- if (dst == NULL)
+ case NFT_META_RTCLASSID:
+ if (!nft_meta_get_eval_rtclassid(skb, dest))
goto err;
- *dest = dst->tclassid;
break;
- }
#endif
#ifdef CONFIG_NETWORK_SECMARK
case NFT_META_SECMARK:
@@ -159,91 +368,42 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
}
- switch (nft_pf(pkt)) {
- case NFPROTO_IPV4:
- if (ipv4_is_multicast(ip_hdr(skb)->daddr))
- nft_reg_store8(dest, PACKET_MULTICAST);
- else
- nft_reg_store8(dest, PACKET_BROADCAST);
- break;
- case NFPROTO_IPV6:
- nft_reg_store8(dest, PACKET_MULTICAST);
- break;
- case NFPROTO_NETDEV:
- switch (skb->protocol) {
- case htons(ETH_P_IP): {
- int noff = skb_network_offset(skb);
- struct iphdr *iph, _iph;
-
- iph = skb_header_pointer(skb, noff,
- sizeof(_iph), &_iph);
- if (!iph)
- goto err;
-
- if (ipv4_is_multicast(iph->daddr))
- nft_reg_store8(dest, PACKET_MULTICAST);
- else
- nft_reg_store8(dest, PACKET_BROADCAST);
-
- break;
- }
- case htons(ETH_P_IPV6):
- nft_reg_store8(dest, PACKET_MULTICAST);
- break;
- default:
- WARN_ON_ONCE(1);
- goto err;
- }
- break;
- default:
- WARN_ON_ONCE(1);
+ if (!nft_meta_get_eval_pkttype_lo(pkt, dest))
goto err;
- }
break;
case NFT_META_CPU:
*dest = raw_smp_processor_id();
break;
- case NFT_META_IIFGROUP:
- if (in == NULL)
- goto err;
- *dest = in->group;
- break;
- case NFT_META_OIFGROUP:
- if (out == NULL)
- goto err;
- *dest = out->group;
- break;
#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
- sk = skb_to_full_sk(skb);
- if (!sk || !sk_fullsock(sk) ||
- !net_eq(nft_net(pkt), sock_net(sk)))
+ if (!nft_meta_get_eval_cgroup(dest, pkt))
goto err;
- *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
#endif
- case NFT_META_PRANDOM: {
- struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
- *dest = prandom_u32_state(state);
+ case NFT_META_PRANDOM:
+ *dest = get_random_u32();
break;
- }
#ifdef CONFIG_XFRM
case NFT_META_SECPATH:
nft_reg_store8(dest, secpath_exists(skb));
break;
#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
- case NFT_META_BRI_IIFNAME:
- if (in == NULL || (p = br_port_get_rcu(in)) == NULL)
- goto err;
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
- case NFT_META_BRI_OIFNAME:
- if (out == NULL || (p = br_port_get_rcu(out)) == NULL)
+ case NFT_META_IIFKIND:
+ case NFT_META_OIFKIND:
+ if (!nft_meta_get_eval_kind(priv->key, dest, pkt))
goto err;
- strncpy((char *)dest, p->br->dev->name, IFNAMSIZ);
- return;
-#endif
+ break;
+ case NFT_META_TIME_NS:
+ case NFT_META_TIME_DAY:
+ case NFT_META_TIME_HOUR:
+ nft_meta_get_eval_time(priv->key, dest);
+ break;
+ case NFT_META_SDIF:
+ *dest = nft_meta_get_eval_sdif(pkt);
+ break;
+ case NFT_META_SDIFNAME:
+ nft_meta_get_eval_sdifname(dest, pkt);
+ break;
default:
WARN_ON(1);
goto err;
@@ -253,10 +413,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
err:
regs->verdict.code = NFT_BREAK;
}
+EXPORT_SYMBOL_GPL(nft_meta_get_eval);
-static void nft_meta_set_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_meta_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *meta = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
@@ -293,16 +454,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr,
WARN_ON(1);
}
}
+EXPORT_SYMBOL_GPL(nft_meta_set_eval);
-static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
+const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = {
[NFTA_META_DREG] = { .type = NLA_U32 },
- [NFTA_META_KEY] = { .type = NLA_U32 },
+ [NFTA_META_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_META_SREG] = { .type = NLA_U32 },
};
+EXPORT_SYMBOL_GPL(nft_meta_policy);
-static int nft_meta_get_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+int nft_meta_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -321,6 +484,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_MARK:
case NFT_META_IIF:
case NFT_META_OIF:
+ case NFT_META_SDIF:
case NFT_META_SKUID:
case NFT_META_SKGID:
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -340,10 +504,12 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
break;
case NFT_META_IIFNAME:
case NFT_META_OIFNAME:
+ case NFT_META_IIFKIND:
+ case NFT_META_OIFKIND:
+ case NFT_META_SDIFNAME:
len = IFNAMSIZ;
break;
case NFT_META_PRANDOM:
- prandom_init_once(&nft_prandom_state);
len = sizeof(u32);
break;
#ifdef CONFIG_XFRM
@@ -351,33 +517,47 @@ static int nft_meta_get_init(const struct nft_ctx *ctx,
len = sizeof(u8);
break;
#endif
-#ifdef CONFIG_NF_TABLES_BRIDGE
- case NFT_META_BRI_IIFNAME:
- case NFT_META_BRI_OIFNAME:
- if (ctx->family != NFPROTO_BRIDGE)
- return -EOPNOTSUPP;
- len = IFNAMSIZ;
+ case NFT_META_TIME_NS:
+ len = sizeof(u64);
+ break;
+ case NFT_META_TIME_DAY:
+ len = sizeof(u8);
+ break;
+ case NFT_META_TIME_HOUR:
+ len = sizeof(u32);
break;
-#endif
default:
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_META_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
+EXPORT_SYMBOL_GPL(nft_meta_get_init);
-static int nft_meta_get_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx)
{
-#ifdef CONFIG_XFRM
- const struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
- if (priv->key != NFT_META_SECPATH)
- return 0;
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
+{
+#ifdef CONFIG_XFRM
+ unsigned int hooks;
switch (ctx->family) {
case NFPROTO_NETDEV:
@@ -400,9 +580,26 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
#endif
}
-static int nft_meta_set_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_meta_get_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ switch (priv->key) {
+ case NFT_META_SECPATH:
+ return nft_meta_get_validate_xfrm(ctx);
+ case NFT_META_SDIF:
+ case NFT_META_SDIFNAME:
+ return nft_meta_get_validate_sdif(ctx);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+int nft_meta_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int hooks;
@@ -428,10 +625,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx,
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_validate);
-static int nft_meta_set_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+int nft_meta_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_meta *priv = nft_expr_priv(expr);
unsigned int len;
@@ -456,8 +654,8 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->sreg = nft_parse_register(tb[NFTA_META_SREG]);
- err = nft_validate_register_load(priv->sreg, len);
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
if (err < 0)
return err;
@@ -466,9 +664,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx,
return 0;
}
+EXPORT_SYMBOL_GPL(nft_meta_set_init);
-static int nft_meta_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+int nft_meta_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -481,8 +680,10 @@ static int nft_meta_get_dump(struct sk_buff *skb,
nla_put_failure:
return -1;
}
+EXPORT_SYMBOL_GPL(nft_meta_get_dump);
-static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_meta_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_meta *priv = nft_expr_priv(expr);
@@ -496,15 +697,75 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
nla_put_failure:
return -1;
}
+EXPORT_SYMBOL_GPL(nft_meta_set_dump);
-static void nft_meta_set_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+void nft_meta_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
const struct nft_meta *priv = nft_expr_priv(expr);
if (priv->key == NFT_META_NFTRACE)
static_branch_dec(&nft_trace_enabled);
}
+EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
+
+static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto,
+ sizeof(__u16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case NFT_META_L4PROTO:
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ case NFT_META_IIF:
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_ifindex, sizeof(__u32), reg);
+ break;
+ case NFT_META_IIFTYPE:
+ NFT_OFFLOAD_MATCH_EXACT(FLOW_DISSECTOR_KEY_META, meta,
+ ingress_iftype, sizeof(__u16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+bool nft_meta_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct nft_meta *meta;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ meta = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != meta->key ||
+ priv->dreg != meta->dreg) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+EXPORT_SYMBOL_GPL(nft_meta_get_reduce);
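The ->reduce() hooks added throughout this series drive a small dead-store pass over a rule's registers. A rough user-space sketch of the idea behind nft_meta_get_reduce() above (the struct names and the fixed-size tracker are illustrative, not the kernel's types): if the tracked producer of the destination register already loaded the same meta key, the second load can be elided.

#include <stdbool.h>

struct expr  { int key; int dreg; };
struct track { const struct expr *selector[16]; };

static bool meta_reduce(struct track *t, const struct expr *e)
{
	const struct expr *prev = t->selector[e->dreg];

	/* same key already produced this register: drop the reload */
	if (prev && prev->key == e->key)
		return true;

	t->selector[e->dreg] = e;	/* track this expr as the producer */
	return false;
}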
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
@@ -512,9 +773,29 @@ static const struct nft_expr_ops nft_meta_get_ops = {
.eval = nft_meta_get_eval,
.init = nft_meta_get_init,
.dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
.validate = nft_meta_get_validate,
+ .offload = nft_meta_get_offload,
};
+static bool nft_meta_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_meta_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_meta_set_ops = {
.type = &nft_meta_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
@@ -522,6 +803,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.init = nft_meta_set_init,
.destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
+ .reduce = nft_meta_set_reduce,
.validate = nft_meta_set_validate,
};
@@ -535,6 +817,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
return ERR_PTR(-EINVAL);
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) && IS_MODULE(CONFIG_NFT_BRIDGE_META)
+ if (ctx->family == NFPROTO_BRIDGE)
+ return ERR_PTR(-EAGAIN);
+#endif
if (tb[NFTA_META_DREG])
return &nft_meta_get_ops;
@@ -544,9 +830,74 @@ nft_meta_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
}
+static int nft_meta_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ if (!tb[NFTA_META_KEY] || !tb[NFTA_META_DREG])
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ len = sizeof(u16);
+ break;
+ case NFT_META_L4PROTO:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ priv->len = len;
+
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+void nft_meta_inner_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+
+ switch (priv->key) {
+ case NFT_META_PROTOCOL:
+ nft_reg_store16(dest, (__force u16)tun_ctx->llproto);
+ break;
+ case NFT_META_L4PROTO:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ nft_reg_store8(dest, tun_ctx->l4proto);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ return;
+
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+EXPORT_SYMBOL_GPL(nft_meta_inner_eval);
+
+static const struct nft_expr_ops nft_meta_inner_ops = {
+ .type = &nft_meta_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .init = nft_meta_inner_init,
+ .dump = nft_meta_get_dump,
+ /* direct call to nft_meta_inner_eval(). */
+};
+
struct nft_expr_type nft_meta_type __read_mostly = {
.name = "meta",
.select_ops = nft_meta_select_ops,
+ .inner_ops = &nft_meta_inner_ops,
.policy = nft_meta_policy,
.maxattr = NFTA_META_MAX,
.owner = THIS_MODULE,
@@ -601,7 +952,7 @@ static int nft_secmark_obj_init(const struct nft_ctx *ctx,
if (tb[NFTA_SECMARK_CTX] == NULL)
return -EINVAL;
- priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL);
+ priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT);
if (!priv->ctx)
return -ENOMEM;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index c15807d10b91..6e21f72c5b57 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
* Copyright (c) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
@@ -21,21 +17,89 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat_l3proto.h>
#include <net/ip.h>
struct nft_nat {
- enum nft_registers sreg_addr_min:8;
- enum nft_registers sreg_addr_max:8;
- enum nft_registers sreg_proto_min:8;
- enum nft_registers sreg_proto_max:8;
+ u8 sreg_addr_min;
+ u8 sreg_addr_max;
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
enum nf_nat_manip_type type:8;
u8 family;
u16 flags;
};
+static void nft_nat_setup_addr(struct nf_nat_range2 *range,
+ const struct nft_regs *regs,
+ const struct nft_nat *priv)
+{
+ switch (priv->family) {
+ case AF_INET:
+ range->min_addr.ip = (__force __be32)
+ regs->data[priv->sreg_addr_min];
+ range->max_addr.ip = (__force __be32)
+ regs->data[priv->sreg_addr_max];
+ break;
+ case AF_INET6:
+ memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min],
+ sizeof(range->min_addr.ip6));
+ memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max],
+ sizeof(range->max_addr.ip6));
+ break;
+ }
+}
+
+static void nft_nat_setup_proto(struct nf_nat_range2 *range,
+ const struct nft_regs *regs,
+ const struct nft_nat *priv)
+{
+ range->min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range->max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+}
+
+static void nft_nat_setup_netmap(struct nf_nat_range2 *range,
+ const struct nft_pktinfo *pkt,
+ const struct nft_nat *priv)
+{
+ struct sk_buff *skb = pkt->skb;
+ union nf_inet_addr new_addr;
+ __be32 netmask;
+ int i, len = 0;
+
+ switch (priv->type) {
+ case NFT_NAT_SNAT:
+ if (nft_pf(pkt) == NFPROTO_IPV4) {
+ new_addr.ip = ip_hdr(skb)->saddr;
+ len = sizeof(struct in_addr);
+ } else {
+ new_addr.in6 = ipv6_hdr(skb)->saddr;
+ len = sizeof(struct in6_addr);
+ }
+ break;
+ case NFT_NAT_DNAT:
+ if (nft_pf(pkt) == NFPROTO_IPV4) {
+ new_addr.ip = ip_hdr(skb)->daddr;
+ len = sizeof(struct in_addr);
+ } else {
+ new_addr.in6 = ipv6_hdr(skb)->daddr;
+ len = sizeof(struct in6_addr);
+ }
+ break;
+ }
+
+ for (i = 0; i < len / sizeof(__be32); i++) {
+ netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]);
+ new_addr.ip6[i] &= ~netmask;
+ new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask;
+ }
+
+ range->min_addr = new_addr;
+ range->max_addr = new_addr;
+}
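A quick user-space rendering of the per-word arithmetic in nft_nat_setup_netmap() above (illustrative only): bits on which min_addr and max_addr agree define the new prefix, and the packet keeps its original host bits on the rest.

#include <inttypes.h>
#include <stdio.h>

static uint32_t netmap_word(uint32_t addr, uint32_t min, uint32_t max)
{
	uint32_t mask = ~(min ^ max);	/* 1s where the range pins a bit */

	return (addr & ~mask) | (min & mask);
}

int main(void)
{
	/* map 10.0.5.7 into 192.168.0.0-192.168.255.255 -> 192.168.5.7 */
	printf("%08" PRIx32 "\n",
	       netmap_word(0x0a000507, 0xc0a80000, 0xc0a8ffff));
	return 0;
}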
+
static void nft_nat_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -46,33 +110,17 @@ static void nft_nat_eval(const struct nft_expr *expr,
struct nf_nat_range2 range;
memset(&range, 0, sizeof(range));
- if (priv->sreg_addr_min) {
- if (priv->family == AF_INET) {
- range.min_addr.ip = (__force __be32)
- regs->data[priv->sreg_addr_min];
- range.max_addr.ip = (__force __be32)
- regs->data[priv->sreg_addr_max];
- } else {
- memcpy(range.min_addr.ip6,
- &regs->data[priv->sreg_addr_min],
- sizeof(range.min_addr.ip6));
- memcpy(range.max_addr.ip6,
- &regs->data[priv->sreg_addr_max],
- sizeof(range.max_addr.ip6));
- }
- range.flags |= NF_NAT_RANGE_MAP_IPS;
+ if (priv->sreg_addr_min) {
+ nft_nat_setup_addr(&range, regs, priv);
+ if (priv->flags & NF_NAT_RANGE_NETMAP)
+ nft_nat_setup_netmap(&range, pkt, priv);
}
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
+ if (priv->sreg_proto_min)
+ nft_nat_setup_proto(&range, regs, priv);
- range.flags |= priv->flags;
+ range.flags = priv->flags;
regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type);
}
@@ -84,16 +132,21 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
[NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_NAT_FLAGS] = { .type = NLA_U32 },
+ [NFTA_NAT_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
static int nft_nat_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
struct nft_nat *priv = nft_expr_priv(expr);
int err;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
if (err < 0)
return err;
@@ -135,80 +188,77 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
priv->type = NF_NAT_MANIP_DST;
break;
default:
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (tb[NFTA_NAT_FAMILY] == NULL)
return -EINVAL;
family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
- if (family != ctx->family)
+ if (ctx->family != NFPROTO_INET && ctx->family != family)
return -EOPNOTSUPP;
switch (family) {
case NFPROTO_IPV4:
- alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip);
+ alen = sizeof_field(struct nf_nat_range, min_addr.ip);
break;
case NFPROTO_IPV6:
- alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6);
+ alen = sizeof_field(struct nf_nat_range, min_addr.ip6);
break;
default:
- return -EAFNOSUPPORT;
+ if (tb[NFTA_NAT_REG_ADDR_MIN])
+ return -EAFNOSUPPORT;
+ break;
}
priv->family = family;
if (tb[NFTA_NAT_REG_ADDR_MIN]) {
- priv->sreg_addr_min =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MIN]);
- err = nft_validate_register_load(priv->sreg_addr_min, alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN],
+ &priv->sreg_addr_min, alen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_ADDR_MAX]) {
- priv->sreg_addr_max =
- nft_parse_register(tb[NFTA_NAT_REG_ADDR_MAX]);
-
- err = nft_validate_register_load(priv->sreg_addr_max,
- alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX],
+ &priv->sreg_addr_max,
+ alen);
if (err < 0)
return err;
} else {
priv->sreg_addr_max = priv->sreg_addr_min;
}
+
+ priv->flags |= NF_NAT_RANGE_MAP_IPS;
}
- plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_NAT_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_NAT_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_NAT_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
priv->sreg_proto_max = priv->sreg_proto_min;
}
- }
- if (tb[NFTA_NAT_FLAGS]) {
- priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
+ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
+ if (tb[NFTA_NAT_FLAGS])
+ priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS]));
+
return nf_ct_netns_get(ctx->net, family);
}
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_nat_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_nat *priv = nft_expr_priv(expr);
@@ -270,6 +320,7 @@ static const struct nft_expr_ops nft_nat_ops = {
.destroy = nft_nat_destroy,
.dump = nft_nat_dump,
.validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_nat_type __read_mostly = {
@@ -280,13 +331,69 @@ static struct nft_expr_type nft_nat_type __read_mostly = {
.owner = THIS_MODULE,
};
+#ifdef CONFIG_NF_TABLES_INET
+static void nft_nat_inet_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_nat *priv = nft_expr_priv(expr);
+
+ if (priv->family == nft_pf(pkt) ||
+ priv->family == NFPROTO_INET)
+ nft_nat_eval(expr, regs, pkt);
+}
+
+static const struct nft_expr_ops nft_nat_inet_ops = {
+ .type = &nft_nat_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+ .eval = nft_nat_inet_eval,
+ .init = nft_nat_init,
+ .destroy = nft_nat_destroy,
+ .dump = nft_nat_dump,
+ .validate = nft_nat_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_inet_nat_type __read_mostly = {
+ .name = "nat",
+ .family = NFPROTO_INET,
+ .ops = &nft_nat_inet_ops,
+ .policy = nft_nat_policy,
+ .maxattr = NFTA_NAT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int nft_nat_inet_module_init(void)
+{
+ return nft_register_expr(&nft_inet_nat_type);
+}
+
+static void nft_nat_inet_module_exit(void)
+{
+ nft_unregister_expr(&nft_inet_nat_type);
+}
+#else
+static int nft_nat_inet_module_init(void) { return 0; }
+static void nft_nat_inet_module_exit(void) { }
+#endif
+
static int __init nft_nat_module_init(void)
{
- return nft_register_expr(&nft_nat_type);
+ int ret = nft_nat_inet_module_init();
+
+ if (ret)
+ return ret;
+
+ ret = nft_register_expr(&nft_nat_type);
+ if (ret)
+ nft_nat_inet_module_exit();
+
+ return ret;
}
static void __exit nft_nat_module_exit(void)
{
+ nft_nat_inet_module_exit();
nft_unregister_expr(&nft_nat_type);
}
@@ -296,3 +403,4 @@ module_exit(nft_nat_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
MODULE_ALIAS_NFT_EXPR("nat");
+MODULE_DESCRIPTION("Network Address Translation support");
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 3cc1b3dc3c3c..bd058babfc82 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016 Laura Garcia <nevola@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/kernel.h>
@@ -13,16 +9,15 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
+#include <linux/random.h>
#include <linux/static_key.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
-static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state);
-
struct nft_ng_inc {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
- atomic_t counter;
+ atomic_t *counter;
u32 offset;
};
@@ -31,9 +26,9 @@ static u32 nft_ng_inc_gen(struct nft_ng_inc *priv)
u32 nval, oval;
do {
- oval = atomic_read(&priv->counter);
+ oval = atomic_read(priv->counter);
nval = (oval + 1 < priv->modulus) ? oval + 1 : 0;
- } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval);
+ } while (atomic_cmpxchg(priv->counter, oval, nval) != oval);
return nval + priv->offset;
}
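The loop above is a lock-free modular increment: compute the wrapped successor, then publish it only if no other CPU advanced the counter in the meantime. A hedged user-space sketch of the same pattern, with C11 atomics standing in for the kernel's atomic_cmpxchg():

#include <stdatomic.h>
#include <stdint.h>

static uint32_t ng_inc_gen(_Atomic uint32_t *counter, uint32_t modulus,
			   uint32_t offset)
{
	uint32_t oval, nval;

	do {
		oval = atomic_load(counter);
		nval = (oval + 1 < modulus) ? oval + 1 : 0;
		/* retry if another thread advanced the counter meanwhile */
	} while (!atomic_compare_exchange_weak(counter, &oval, nval));

	return nval + offset;
}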
@@ -59,6 +54,7 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_ng_inc *priv = nft_expr_priv(expr);
+ int err;
if (tb[NFTA_NG_OFFSET])
priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET]));
@@ -70,11 +66,32 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
- atomic_set(&priv->counter, priv->modulus - 1);
+ priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT);
+ if (!priv->counter)
+ return -ENOMEM;
+
+ atomic_set(priv->counter, priv->modulus - 1);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ err = nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
+ if (err < 0)
+ goto err;
+
+ return 0;
+err:
+ kfree(priv->counter);
+
+ return err;
+}
+
+static bool nft_ng_inc_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
}
static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg,
@@ -95,7 +112,8 @@ nla_put_failure:
return -1;
}
-static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_inc_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_inc *priv = nft_expr_priv(expr);
@@ -103,18 +121,23 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static void nft_ng_inc_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_inc *priv = nft_expr_priv(expr);
+
+ kfree(priv->counter);
+}
+
struct nft_ng_random {
- enum nft_registers dreg:8;
+ u8 dreg;
u32 modulus;
u32 offset;
};
-static u32 nft_ng_random_gen(struct nft_ng_random *priv)
+static u32 nft_ng_random_gen(const struct nft_ng_random *priv)
{
- struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state);
-
- return reciprocal_scale(prandom_u32_state(state), priv->modulus) +
- priv->offset;
+ return reciprocal_scale(get_random_u32(), priv->modulus) + priv->offset;
}
static void nft_ng_random_eval(const struct nft_expr *expr,
@@ -142,15 +165,12 @@ static int nft_ng_random_init(const struct nft_ctx *ctx,
if (priv->offset + priv->modulus - 1 < priv->offset)
return -EOVERFLOW;
- prandom_init_once(&nft_numgen_prandom_state);
-
- priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]);
-
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, sizeof(u32));
+ return nft_parse_register_store(ctx, tb[NFTA_NG_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, sizeof(u32));
}
-static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_ng_random_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_ng_random *priv = nft_expr_priv(expr);
@@ -158,13 +178,25 @@ static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
priv->offset);
}
+static bool nft_ng_random_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_ng_random *priv = nft_expr_priv(expr);
+
+ nft_reg_track_cancel(track, priv->dreg, NFT_REG32_SIZE);
+
+ return false;
+}
+
static struct nft_expr_type nft_ng_type;
static const struct nft_expr_ops nft_ng_inc_ops = {
.type = &nft_ng_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)),
.eval = nft_ng_inc_eval,
.init = nft_ng_inc_init,
+ .destroy = nft_ng_inc_destroy,
.dump = nft_ng_inc_dump,
+ .reduce = nft_ng_inc_reduce,
};
static const struct nft_expr_ops nft_ng_random_ops = {
@@ -173,6 +205,7 @@ static const struct nft_expr_ops nft_ng_random_ops = {
.eval = nft_ng_random_eval,
.init = nft_ng_random_init,
.dump = nft_ng_random_dump,
+ .reduce = nft_ng_random_reduce,
};
static const struct nft_expr_ops *
@@ -221,3 +254,4 @@ module_exit(nft_ng_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>");
MODULE_ALIAS_NFT_EXPR("numgen");
+MODULE_DESCRIPTION("nftables number generator module");
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index a3185ca2a3a9..1a62e384766a 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2012-2016 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
*/
#include <linux/init.h>
@@ -12,19 +9,48 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#define nft_objref_priv(expr) *((struct nft_object **)nft_expr_priv(expr))
-static void nft_objref_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_object *obj = nft_objref_priv(expr);
obj->ops->eval(obj, regs, pkt);
}
+static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type)
+{
+ unsigned int hooks;
+
+ switch (type) {
+ case NFT_OBJECT_SYNPROXY:
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD);
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int nft_objref_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, obj->ops->type->type);
+}
+
static int nft_objref_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -38,22 +64,26 @@ static int nft_objref_init(const struct nft_ctx *ctx,
return -EINVAL;
objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
- obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype,
+ obj = nft_obj_lookup(ctx->net, ctx->table,
+ tb[NFTA_OBJREF_IMM_NAME], objtype,
genmask);
if (IS_ERR(obj))
return -ENOENT;
+ if (!nft_use_inc(&obj->use))
+ return -EMFILE;
+
nft_objref_priv(expr) = obj;
- obj->use++;
return 0;
}
-static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_object *obj = nft_objref_priv(expr);
- if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
+ if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->key.name) ||
nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE,
htonl(obj->ops->type->type)))
goto nla_put_failure;
@@ -64,45 +94,61 @@ nla_put_failure:
return -1;
}
-static void nft_objref_destroy(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_objref_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_object *obj = nft_objref_priv(expr);
- obj->use--;
+ if (phase == NFT_TRANS_COMMIT)
+ return;
+
+ nft_use_dec(&obj->use);
+}
+
+static void nft_objref_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_object *obj = nft_objref_priv(expr);
+
+ nft_use_inc_restore(&obj->use);
}
-static struct nft_expr_type nft_objref_type;
static const struct nft_expr_ops nft_objref_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_object *)),
.eval = nft_objref_eval,
.init = nft_objref_init,
- .destroy = nft_objref_destroy,
+ .activate = nft_objref_activate,
+ .deactivate = nft_objref_deactivate,
.dump = nft_objref_dump,
+ .validate = nft_objref_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_objref_map {
struct nft_set *set;
- enum nft_registers sreg:8;
+ u8 sreg;
struct nft_set_binding binding;
};
-static void nft_objref_map_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_objref_map_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
+ struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
struct nft_object *obj;
- bool found;
- found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
- &ext);
- if (!found) {
- regs->verdict.code = NFT_BREAK;
- return;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ if (!ext) {
+ ext = nft_set_catchall_lookup(net, set);
+ if (!ext) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
}
obj = *nft_set_ext_obj(ext);
obj->ops->eval(obj, regs, pkt);
@@ -126,8 +172,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_OBJECT))
return -EINVAL;
- priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
- err = nft_validate_register_load(priv->sreg, set->klen);
+ err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
+ set->klen);
if (err < 0)
return err;
@@ -141,7 +187,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_objref_map_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_objref_map *priv = nft_expr_priv(expr);
@@ -155,20 +202,21 @@ nla_put_failure:
return -1;
}
-static void nft_objref_map_activate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ enum nft_trans_phase phase)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- nf_tables_rebind_set(ctx, priv->set, &priv->binding);
+ nf_tables_deactivate_set(ctx, priv->set, &priv->binding, phase);
}
-static void nft_objref_map_deactivate(const struct nft_ctx *ctx,
- const struct nft_expr *expr)
+static void nft_objref_map_activate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_objref_map *priv = nft_expr_priv(expr);
- nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+ nf_tables_activate_set(ctx, priv->set);
}
static void nft_objref_map_destroy(const struct nft_ctx *ctx,
@@ -179,7 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx,
nf_tables_destroy_set(ctx, priv->set);
}
-static struct nft_expr_type nft_objref_type;
+static int nft_objref_map_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+ return nft_objref_validate_obj_type(ctx, priv->set->objtype);
+}
+
static const struct nft_expr_ops nft_objref_map_ops = {
.type = &nft_objref_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
@@ -189,6 +244,8 @@ static const struct nft_expr_ops nft_objref_map_ops = {
.deactivate = nft_objref_map_deactivate,
.destroy = nft_objref_map_destroy,
.dump = nft_objref_map_dump,
+ .validate = nft_objref_map_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -216,27 +273,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
[NFTA_OBJREF_SET_ID] = { .type = NLA_U32 },
};
-static struct nft_expr_type nft_objref_type __read_mostly = {
+struct nft_expr_type nft_objref_type __read_mostly = {
.name = "objref",
.select_ops = nft_objref_select_ops,
.policy = nft_objref_policy,
.maxattr = NFTA_OBJREF_MAX,
.owner = THIS_MODULE,
};
-
-static int __init nft_objref_module_init(void)
-{
- return nft_register_expr(&nft_objref_type);
-}
-
-static void __exit nft_objref_module_exit(void)
-{
- nft_unregister_expr(&nft_objref_type);
-}
-
-module_init(nft_objref_module_init);
-module_exit(nft_objref_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_EXPR("objref");
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index b13618c764ec..1c0b493ef0a9 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <net/ip.h>
#include <net/tcp.h>
@@ -5,13 +6,15 @@
#include <linux/netfilter/nfnetlink_osf.h>
struct nft_osf {
- enum nft_registers dreg:8;
+ u8 dreg;
u8 ttl;
+ u32 flags;
};
static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
[NFTA_OSF_DREG] = { .type = NLA_U32 },
[NFTA_OSF_TTL] = { .type = NLA_U8 },
+ [NFTA_OSF_FLAGS] = { .type = NLA_U32 },
};
static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
@@ -20,9 +23,15 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct nft_osf *priv = nft_expr_priv(expr);
u32 *dest = &regs->data[priv->dreg];
struct sk_buff *skb = pkt->skb;
+ char os_match[NFT_OSF_MAXGENRELEN];
const struct tcphdr *tcp;
+ struct nf_osf_data data;
struct tcphdr _tcph;
- const char *os_name;
+
+ if (pkt->tprot != IPPROTO_TCP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
tcp = skb_header_pointer(skb, ip_hdrlen(skb),
sizeof(struct tcphdr), &_tcph);
@@ -35,11 +44,17 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl);
- if (!os_name)
- strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
- else
- strncpy((char *)dest, os_name, NFT_OSF_MAXGENRELEN);
+ if (!nf_osf_find(skb, nf_osf_fingers, priv->ttl, &data)) {
+ strscpy_pad((char *)dest, "unknown", NFT_OSF_MAXGENRELEN);
+ } else {
+ if (priv->flags & NFT_OSF_F_VERSION)
+ snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s",
+ data.genre, data.version);
+ else
+ strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN);
+
+ strscpy_pad((char *)dest, os_match, NFT_OSF_MAXGENRELEN);
+ }
}
static int nft_osf_init(const struct nft_ctx *ctx,
@@ -47,9 +62,12 @@ static int nft_osf_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_osf *priv = nft_expr_priv(expr);
- int err;
+ u32 flags;
u8 ttl;
+ if (!tb[NFTA_OSF_DREG])
+ return -EINVAL;
+
if (tb[NFTA_OSF_TTL]) {
ttl = nla_get_u8(tb[NFTA_OSF_TTL]);
if (ttl > 2)
@@ -57,22 +75,29 @@ static int nft_osf_init(const struct nft_ctx *ctx,
priv->ttl = ttl;
}
- priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
- err = nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, NFT_OSF_MAXGENRELEN);
- if (err < 0)
- return err;
+ if (tb[NFTA_OSF_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_OSF_FLAGS]));
+ if (flags != NFT_OSF_F_VERSION)
+ return -EINVAL;
+ priv->flags = flags;
+ }
- return 0;
+ return nft_parse_register_store(ctx, tb[NFTA_OSF_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE,
+ NFT_OSF_MAXGENRELEN);
}
-static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_osf_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_osf *priv = nft_expr_priv(expr);
if (nla_put_u8(skb, NFTA_OSF_TTL, priv->ttl))
goto nla_put_failure;
+ if (nla_put_u32(skb, NFTA_OSF_FLAGS, ntohl((__force __be32)priv->flags)))
+ goto nla_put_failure;
+
if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
goto nla_put_failure;
@@ -83,12 +108,47 @@ nla_put_failure:
}
static int nft_osf_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
- return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
- (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_FORWARD));
+ unsigned int hooks;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ hooks = (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_FORWARD);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static bool nft_osf_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ struct nft_osf *osf;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ osf = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->flags != osf->flags ||
+ priv->ttl != osf->ttl) {
+ nft_reg_track_update(track, expr, priv->dreg, NFT_OSF_MAXGENRELEN);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
}
static struct nft_expr_type nft_osf_type;
@@ -99,6 +159,7 @@ static const struct nft_expr_ops nft_osf_op = {
.dump = nft_osf_dump,
.type = &nft_osf_type,
.validate = nft_osf_validate,
+ .reduce = nft_osf_reduce,
};
static struct nft_expr_type nft_osf_type __read_mostly = {
@@ -125,3 +186,4 @@ module_exit(nft_osf_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
MODULE_ALIAS_NFT_EXPR("osf");
+MODULE_DESCRIPTION("nftables passive OS fingerprint support");
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index e110b0ebbf58..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -18,27 +15,46 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_offload.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
+#include <net/gre.h>
#include <linux/icmpv6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/sctp/checksum.h>
+
+static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
+ struct vlan_ethhdr *veth)
+{
+ if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN))
+ return false;
+
+ veth->h_vlan_proto = skb->vlan_proto;
+ veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth->h_vlan_encapsulated_proto = skb->protocol;
+
+ return true;
+}
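What nft_payload_rebuild_vlan_hdr() reassembles, roughly: with a hardware-accelerated tag, the four VLAN bytes between the MAC addresses are absent from the skb data, so the on-wire layout must be recreated from skb metadata before a link-layer payload read can be satisfied. Illustrative layout (mirroring, not reproducing, the kernel's struct vlan_ethhdr):

#include <stdint.h>

struct vlan_eth_wire {			/* VLAN_ETH_HLEN == 18 bytes */
	uint8_t	 h_dest[6];
	uint8_t	 h_source[6];
	uint16_t h_vlan_proto;		/* rebuilt from skb->vlan_proto */
	uint16_t h_vlan_TCI;		/* rebuilt from the offloaded tag */
	uint16_t h_vlan_encapsulated_proto;	/* original skb->protocol */
};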
/* add the vlan header into the user buffer if the tag was removed by offloads */
static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
- u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d;
+ u8 *vlanh, *dst_u8 = (u8 *) d;
struct vlan_ethhdr veth;
vlanh = (u8 *) &veth;
- if (offset < ETH_HLEN) {
- u8 ethlen = min_t(u8, len, ETH_HLEN - offset);
+ if (offset < VLAN_ETH_HLEN) {
+ u8 ethlen = len;
- if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN))
+ if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth))
return false;
- veth.h_vlan_proto = skb->vlan_proto;
+ if (offset + len > VLAN_ETH_HLEN)
+ ethlen -= offset + len - VLAN_ETH_HLEN;
memcpy(dst_u8, vlanh + offset, ethlen);
@@ -48,44 +64,117 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
dst_u8 += ethlen;
offset = ETH_HLEN;
- } else if (offset >= VLAN_ETH_HLEN) {
+ } else {
offset -= VLAN_HLEN;
- goto skip;
}
- veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
- veth.h_vlan_encapsulated_proto = skb->protocol;
+ return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
+}
+
+static int __nft_payload_inner_offset(struct nft_pktinfo *pkt)
+{
+ unsigned int thoff = nft_thoff(pkt);
- vlanh += offset;
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ return -1;
+
+ switch (pkt->tprot) {
+ case IPPROTO_UDP:
+ pkt->inneroff = thoff + sizeof(struct udphdr);
+ break;
+ case IPPROTO_TCP: {
+ struct tcphdr *th, _tcph;
- vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset);
- memcpy(dst_u8, vlanh, vlan_len);
+ th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph);
+ if (!th)
+ return -1;
- len -= vlan_len;
- if (!len)
+ pkt->inneroff = thoff + __tcp_hdrlen(th);
+ }
+ break;
+ case IPPROTO_GRE: {
+ u32 offset = sizeof(struct gre_base_hdr);
+ struct gre_base_hdr *gre, _gre;
+ __be16 version;
+
+ gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre);
+ if (!gre)
+ return -1;
+
+ version = gre->flags & GRE_VERSION;
+ switch (version) {
+ case GRE_VERSION_0:
+ if (gre->flags & GRE_ROUTING)
+ return -1;
+
+ if (gre->flags & GRE_CSUM) {
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
+ }
+ if (gre->flags & GRE_KEY)
+ offset += sizeof_field(struct gre_full_hdr, key);
+
+ if (gre->flags & GRE_SEQ)
+ offset += sizeof_field(struct gre_full_hdr, seq);
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->inneroff = thoff + offset;
+ }
+ break;
+ case IPPROTO_IPIP:
+ pkt->inneroff = thoff;
+ break;
+ default:
+ return -1;
+ }
+
+ pkt->flags |= NFT_PKTINFO_INNER;
+
+ return 0;
+}
+
+int nft_payload_inner_offset(const struct nft_pktinfo *pkt)
+{
+ if (!(pkt->flags & NFT_PKTINFO_INNER) &&
+ __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0)
+ return -1;
+
+ return pkt->inneroff;
+}
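The GRE case in __nft_payload_inner_offset() above sizes the header from its flag bits: the 4-byte base grows by one 32-bit word each for the optional checksum (plus reserved1), key, and sequence fields. A hedged sketch of the same computation, with host-order flag constants standing in for the kernel's big-endian GRE_* definitions:

#include <stdint.h>

#define GRE_F_CSUM	0x8000
#define GRE_F_KEY	0x2000
#define GRE_F_SEQ	0x1000

static unsigned int gre_hdr_len(uint16_t flags)
{
	unsigned int len = 4;	/* base header: flags/version + protocol */

	if (flags & GRE_F_CSUM)
		len += 4;	/* 16-bit csum + 16-bit reserved1 */
	if (flags & GRE_F_KEY)
		len += 4;
	if (flags & GRE_F_SEQ)
		len += 4;
	return len;
}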
+
+static bool nft_payload_need_vlan_adjust(u32 offset, u32 len)
+{
+ unsigned int boundary = offset + len;
+
+ /* data past ether src/dst requested, copy needed */
+ if (boundary > offsetof(struct ethhdr, h_proto))
return true;
- dst_u8 += vlan_len;
- skip:
- return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0;
+ return false;
}
-static void nft_payload_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_payload_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_payload *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
u32 *dest = &regs->data[priv->dreg];
int offset;
- dest[priv->len / NFT_REG32_SIZE] = 0;
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
- if (!skb_mac_header_was_set(skb))
+ if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0)
goto err;
- if (skb_vlan_tag_present(skb)) {
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
if (!nft_payload_copy_vlan(dest, skb,
priv->offset, priv->len))
goto err;
@@ -97,12 +186,18 @@ static void nft_payload_eval(const struct nft_expr *expr,
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
offset += priv->offset;
@@ -117,10 +212,11 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 },
+ [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
static int nft_payload_init(const struct nft_ctx *ctx,
@@ -132,13 +228,14 @@ static int nft_payload_init(const struct nft_ctx *ctx,
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->dreg = nft_parse_register(tb[NFTA_PAYLOAD_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, priv->len);
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
}
-static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload *priv = nft_expr_priv(expr);
@@ -153,12 +250,336 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct nft_payload *payload;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ payload = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->base != payload->base ||
+ priv->offset != payload->offset ||
+ priv->len != payload->len) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
+
+static bool nft_payload_offload_mask(struct nft_offload_reg *reg,
+ u32 priv_len, u32 field_len)
+{
+ unsigned int remainder, delta, k;
+ struct nft_data mask = {};
+ __be32 remainder_mask;
+
+ if (priv_len == field_len) {
+ memset(&reg->mask, 0xff, priv_len);
+ return true;
+ } else if (priv_len > field_len) {
+ return false;
+ }
+
+ memset(&mask, 0xff, field_len);
+ remainder = priv_len % sizeof(u32);
+ if (remainder) {
+ k = priv_len / sizeof(u32);
+ delta = field_len - priv_len;
+ remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1));
+ mask.data[k] = (__force u32)remainder_mask;
+ }
+
+ memcpy(&reg->mask, &mask, field_len);
+
+ return true;
+}
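The remainder handling in nft_payload_offload_mask() above builds a big-endian prefix mask for the last, partially covered 32-bit word of the flow dissector field. Reduced to the core computation (a sketch assuming 0 < delta < 4, as in the remainder path; delta is the count of trailing bytes the match does not cover):

#include <stdint.h>
#include <arpa/inet.h>

static uint32_t prefix_mask_be(unsigned int delta)
{
	/* e.g. delta == 1 -> bytes ff ff ff 00: keep the first 3 bytes */
	return htonl(~((1u << (delta * 8)) - 1));
}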
+
+static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct ethhdr, h_source):
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+ src, ETH_ALEN, reg);
+ break;
+ case offsetof(struct ethhdr, h_dest):
+ if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
+ dst, ETH_ALEN, reg);
+ break;
+ case offsetof(struct ethhdr, h_proto):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
+ n_proto, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+ vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
+ vlan_tci, sizeof(__be16), reg,
+ NFT_OFFLOAD_F_NETWORK2HOST);
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) +
+ sizeof(struct vlan_hdr):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan,
+ vlan_tpid, sizeof(__be16), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_ip(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct iphdr, saddr):
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src,
+ sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
+ break;
+ case offsetof(struct iphdr, daddr):
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in_addr)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst,
+ sizeof(struct in_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS);
+ break;
+ case offsetof(struct iphdr, protocol):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct ipv6hdr, saddr):
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src,
+ sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
+ break;
+ case offsetof(struct ipv6hdr, daddr):
+ if (!nft_payload_offload_mask(reg, priv->len,
+ sizeof(struct in6_addr)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst,
+ sizeof(struct in6_addr), reg);
+ nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS);
+ break;
+ case offsetof(struct ipv6hdr, nexthdr):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto,
+ sizeof(__u8), reg);
+ nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_nh(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ int err;
+
+ switch (ctx->dep.l3num) {
+ case htons(ETH_P_IP):
+ err = nft_payload_offload_ip(ctx, flow, priv);
+ break;
+ case htons(ETH_P_IPV6):
+ err = nft_payload_offload_ip6(ctx, flow, priv);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct tcphdr, source):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+ sizeof(__be16), reg);
+ break;
+ case offsetof(struct tcphdr, dest):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+ sizeof(__be16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_udp(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
+
+ switch (priv->offset) {
+ case offsetof(struct udphdr, source):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src,
+ sizeof(__be16), reg);
+ break;
+ case offsetof(struct udphdr, dest):
+ if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16)))
+ return -EOPNOTSUPP;
+
+ NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst,
+ sizeof(__be16), reg);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int nft_payload_offload_th(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_payload *priv)
+{
+ int err;
+
+ switch (ctx->dep.protonum) {
+ case IPPROTO_TCP:
+ err = nft_payload_offload_tcp(ctx, flow, priv);
+ break;
+ case IPPROTO_UDP:
+ err = nft_payload_offload_udp(ctx, flow, priv);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static int nft_payload_offload(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow,
+ const struct nft_expr *expr)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ int err;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ err = nft_payload_offload_ll(ctx, flow, priv);
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ err = nft_payload_offload_nh(ctx, flow, priv);
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ err = nft_payload_offload_th(ctx, flow, priv);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+ return err;
+}
+
static const struct nft_expr_ops nft_payload_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
+ .offload = nft_payload_offload,
};
const struct nft_expr_ops nft_payload_fast_ops = {
@@ -167,11 +588,103 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.eval = nft_payload_eval,
.init = nft_payload_init,
.dump = nft_payload_dump,
+ .reduce = nft_payload_reduce,
+ .offload = nft_payload_offload,
+};
+
+void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ struct nft_inner_tun_ctx *tun_ctx)
+{
+ const struct nft_payload *priv = nft_expr_priv(expr);
+ const struct sk_buff *skb = pkt->skb;
+ u32 *dest = &regs->data[priv->dreg];
+ int offset;
+
+ if (priv->len % NFT_REG32_SIZE)
+ dest[priv->len / NFT_REG32_SIZE] = 0;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN))
+ goto err;
+
+ offset = tun_ctx->inner_tunoff;
+ break;
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL))
+ goto err;
+
+ offset = tun_ctx->inner_lloff;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH))
+ goto err;
+
+ offset = tun_ctx->inner_nhoff;
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH))
+ goto err;
+
+ offset = tun_ctx->inner_thoff;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+ offset += priv->offset;
+
+ if (skb_copy_bits(skb, offset, dest, priv->len) < 0)
+ goto err;
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_payload_inner_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload *priv = nft_expr_priv(expr);
+ u32 base;
+
+ if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] ||
+ !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG])
+ return -EINVAL;
+
+ base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ switch (base) {
+ case NFT_PAYLOAD_TUN_HEADER:
+ case NFT_PAYLOAD_LL_HEADER:
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->base = base;
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+
+ return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
+}
+
+static const struct nft_expr_ops nft_payload_inner_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)),
+ .init = nft_payload_inner_init,
+ .dump = nft_payload_dump,
+ /* direct call to nft_payload_inner_eval(). */
};
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
if (*sum == 0)
*sum = CSUM_MANGLED_0;
}
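The csum_replace4() call above is the standard RFC 1624 incremental update, HC' = ~(~HC + ~m + m'), with the UDP quirk that a computed 0 must be transmitted as 0xffff (CSUM_MANGLED_0) because 0 means "no checksum". A self-contained 16-bit version of the arithmetic (illustrative; the kernel works on __wsum/__sum16 types):

#include <stdint.h>

static uint16_t csum_update(uint16_t check, uint16_t old, uint16_t new)
{
	uint32_t sum = (uint16_t)~check + (uint16_t)~old + new;

	sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries ... */
	sum = (sum & 0xffff) + (sum >> 16);	/* ... back into 16 bits */
	check = (uint16_t)~sum;
	return check ? check : 0xffff;		/* CSUM_MANGLED_0 */
}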
@@ -191,14 +704,17 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
struct sk_buff *skb,
unsigned int *l4csum_offset)
{
+ if (pkt->fragoff)
+ return -1;
+
switch (pkt->tprot) {
case IPPROTO_TCP:
*l4csum_offset = offsetof(struct tcphdr, check);
break;
case IPPROTO_UDP:
- if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+ if (!nft_payload_udp_checksum(skb, nft_thoff(pkt)))
return -1;
- /* Fall through. */
+ fallthrough;
case IPPROTO_UDPLITE:
*l4csum_offset = offsetof(struct udphdr, check);
break;
@@ -209,7 +725,20 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
return -1;
}
- *l4csum_offset += pkt->xt.thoff;
+ *l4csum_offset += nft_thoff(pkt);
+ return 0;
+}
+
+static int nft_payload_csum_sctp(struct sk_buff *skb, int offset)
+{
+ struct sctphdr *sh;
+
+ if (skb_ensure_writable(skb, offset + sizeof(*sh)))
+ return -1;
+
+ sh = (struct sctphdr *)(skb->data + offset);
+ sh->checksum = sctp_compute_cksum(skb, offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
return 0;
}
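
SCTP is special-cased above because it does not use the ones' complement
Internet checksum at all: its checksum field carries a CRC32c (Castagnoli)
over the whole packet with the field zeroed, which cannot be patched
incrementally the same way. A slow but self-contained bitwise CRC32c for
reference (byte-order handling of the stored value omitted):

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32c(const uint8_t *data, size_t len)
{
	uint32_t crc = 0xffffffffu;

	for (size_t i = 0; i < len; i++) {
		crc ^= data[i];
		for (int k = 0; k < 8; k++)	/* reflected poly 0x82f63b78 */
			crc = (crc >> 1) ^ (0x82f63b78u & -(crc & 1));
	}
	return ~crc;
}
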
@@ -243,7 +772,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
tsum));
}
- if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) ||
skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0)
return -1;
@@ -259,47 +788,125 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
return -1;
nft_csum_replace(&sum, fsum, tsum);
- if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) ||
skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
return -1;
return 0;
}
+struct nft_payload_set {
+ enum nft_payload_bases base:8;
+ u16 offset;
+ u8 len;
+ u8 sreg;
+ u8 csum_type;
+ u8 csum_offset;
+ u8 csum_flags;
+};
+
+/* This is not struct vlan_hdr. */
+struct nft_payload_vlan_hdr {
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
+};
+
+static bool
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
+ int *vlan_hlen)
+{
+ struct nft_payload_vlan_hdr *vlanh;
+ __be16 vlan_proto;
+ u16 vlan_tci;
+
+ if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) {
+ *vlan_hlen = VLAN_HLEN;
+ return true;
+ }
+
+ switch (offset) {
+ case offsetof(struct vlan_ethhdr, h_vlan_proto):
+ if (len == 2) {
+ vlan_proto = nft_reg_load_be16(src);
+ skb->vlan_proto = vlan_proto;
+ } else if (len == 4) {
+ vlanh = (struct nft_payload_vlan_hdr *)src;
+ __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto,
+ ntohs(vlanh->h_vlan_TCI));
+ } else {
+ return false;
+ }
+ break;
+ case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+ if (len != 2)
+ return false;
+
+ vlan_tci = ntohs(nft_reg_load_be16(src));
+ skb->vlan_tci = vlan_tci;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
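
The offsetof() switch above exists because, with a hardware-accelerated VLAN
tag, TPID and TCI live in skb metadata rather than in the mac header bytes,
so writes to those two fields must patch the metadata instead. A simplified
userspace model of the dispatch (struct and field names are illustrative;
byte-order conversions omitted):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct vlan_ethhdr_model {
	uint8_t  h_dest[6];
	uint8_t  h_source[6];
	uint16_t h_vlan_proto;		/* TPID */
	uint16_t h_vlan_TCI;		/* PCP/DEI/VID */
	uint16_t h_vlan_encapsulated_proto;
};

struct skb_model {
	uint16_t vlan_proto;		/* offloaded tag lives here ... */
	uint16_t vlan_tci;		/* ... not in the packet data */
};

static bool set_vlan_field(struct skb_model *skb, size_t off, uint16_t val)
{
	switch (off) {
	case offsetof(struct vlan_ethhdr_model, h_vlan_proto):
		skb->vlan_proto = val;
		return true;
	case offsetof(struct vlan_ethhdr_model, h_vlan_TCI):
		skb->vlan_tci = val;
		return true;
	default:
		return false;	/* field is in the real packet bytes */
	}
}
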
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
- struct sk_buff *skb = pkt->skb;
const u32 *src = &regs->data[priv->sreg];
- int offset, csum_offset;
+ int offset, csum_offset, vlan_hlen = 0;
+ struct sk_buff *skb = pkt->skb;
__wsum fsum, tsum;
switch (priv->base) {
case NFT_PAYLOAD_LL_HEADER:
if (!skb_mac_header_was_set(skb))
goto err;
- offset = skb_mac_header(skb) - skb->data;
+
+ if (skb_vlan_tag_present(skb) &&
+ nft_payload_need_vlan_adjust(priv->offset, priv->len)) {
+ if (!nft_payload_set_vlan(src, skb,
+ priv->offset, priv->len,
+ &vlan_hlen))
+ goto err;
+
+ if (!vlan_hlen)
+ return;
+ }
+
+ offset = skb_mac_header(skb) - skb->data - vlan_hlen;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
- if (!pkt->tprot_set)
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
+ goto err;
+ offset = nft_thoff(pkt);
+ break;
+ case NFT_PAYLOAD_INNER_HEADER:
+ offset = nft_payload_inner_offset(pkt);
+ if (offset < 0)
goto err;
- offset = pkt->xt.thoff;
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
+ goto err;
}
csum_offset = offset + priv->csum_offset;
offset += priv->offset;
if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
- (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
+ ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER &&
+ priv->base != NFT_PAYLOAD_INNER_HEADER) ||
skb->ip_summed != CHECKSUM_PARTIAL)) {
+ if (offset + priv->len > skb->len)
+ goto err;
+
fsum = skb_checksum(skb, offset, priv->len, 0);
tsum = csum_partial(src, priv->len, 0);
@@ -312,10 +919,18 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
goto err;
}
- if (!skb_make_writable(skb, max(offset + priv->len, 0)) ||
+ if (skb_ensure_writable(skb, max(offset + priv->len, 0)) ||
skb_store_bits(skb, offset, src, priv->len) < 0)
goto err;
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
+ pkt->tprot == IPPROTO_SCTP &&
+ skb->ip_summed != CHECKSUM_PARTIAL) {
+ if (pkt->fragoff == 0 &&
+ nft_payload_csum_sctp(skb, nft_thoff(pkt)))
+ goto err;
+ }
+
return;
err:
regs->verdict.code = NFT_BREAK;
@@ -325,19 +940,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
struct nft_payload_set *priv = nft_expr_priv(expr);
+ int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
- priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
- priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return err;
+ priv->offset = offset;
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
- priv->csum_type =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
- if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
- priv->csum_offset =
- ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+ csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
+ if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX,
+ &csum_offset);
+ if (err < 0)
+ return err;
+
+ priv->csum_offset = csum_offset;
+ }
if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) {
u32 flags;
@@ -348,18 +972,28 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
priv->csum_flags = flags;
}
- switch (priv->csum_type) {
+ switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
case NFT_PAYLOAD_CSUM_INET:
break;
+ case NFT_PAYLOAD_CSUM_SCTP:
+ if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
+ return -EINVAL;
+
+ if (priv->csum_offset != offsetof(struct sctphdr, checksum))
+ return -EINVAL;
+ break;
default:
return -EOPNOTSUPP;
}
+ priv->csum_type = csum_type;
- return nft_validate_register_load(priv->sreg, priv->len);
+ return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg,
+ priv->len);
}
-static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_payload_set_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_payload_set *priv = nft_expr_priv(expr);
@@ -378,12 +1012,32 @@ nla_put_failure:
return -1;
}
+static bool nft_payload_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_payload_ops &&
+ track->regs[i].selector->ops != &nft_payload_fast_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
static const struct nft_expr_ops nft_payload_set_ops = {
.type = &nft_payload_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
.eval = nft_payload_set_eval,
.init = nft_payload_set_init,
.dump = nft_payload_set_dump,
+ .reduce = nft_payload_set_reduce,
};
static const struct nft_expr_ops *
@@ -392,6 +1046,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
{
enum nft_payload_bases base;
unsigned int offset, len;
+ int err;
if (tb[NFTA_PAYLOAD_BASE] == NULL ||
tb[NFTA_PAYLOAD_OFFSET] == NULL ||
@@ -403,6 +1058,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
case NFT_PAYLOAD_LL_HEADER:
case NFT_PAYLOAD_NETWORK_HEADER:
case NFT_PAYLOAD_TRANSPORT_HEADER:
+ case NFT_PAYLOAD_INNER_HEADER:
break;
default:
return ERR_PTR(-EOPNOTSUPP);
@@ -417,11 +1073,16 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
- len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len);
+ if (err < 0)
+ return ERR_PTR(err);
if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) &&
- base != NFT_PAYLOAD_LL_HEADER)
+ base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER)
return &nft_payload_fast_ops;
else
return &nft_payload_ops;
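
The predicate above gates the fast path: only aligned 1/2/4-byte loads from
bases with a fixed linear header qualify, everything else goes through the
generic skb_copy_bits() path. Expressed on its own (userspace sketch):

#include <stdbool.h>

static bool is_pow2(unsigned int n)
{
	return n && !(n & (n - 1));
}

static bool payload_fast_eligible(unsigned int offset, unsigned int len,
				  bool ll_base, bool inner_base)
{
	/* short-circuit keeps len nonzero before the modulo */
	return len <= 4 && is_pow2(len) && !(offset % len) &&
	       !ll_base && !inner_base;
}
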
@@ -430,6 +1091,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
struct nft_expr_type nft_payload_type __read_mostly = {
.name = "payload",
.select_ops = nft_payload_select_ops,
+ .inner_ops = &nft_payload_inner_ops,
.policy = nft_payload_policy,
.maxattr = NFTA_PAYLOAD_MAX,
.owner = THIS_MODULE,
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 98613658d4ac..344fe311878f 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Eric Leblond <eric@regit.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code partly funded by OISF
* (http://www.openinfosecfoundation.org/)
*/
@@ -22,10 +19,10 @@
static u32 jhash_initval __read_mostly;
struct nft_queue {
- enum nft_registers sreg_qnum:8;
- u16 queuenum;
- u16 queues_total;
- u16 flags;
+ u8 sreg_qnum;
+ u16 queuenum;
+ u16 queues_total;
+ u16 flags;
};
static void nft_queue_eval(const struct nft_expr *expr,
@@ -71,6 +68,30 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static int nft_queue_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ break;
+ case NFPROTO_NETDEV: /* lacks okfn */
+ fallthrough;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, supported_hooks);
+}
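
Hook restrictions here are just bitmask membership tests: a base chain hangs
off exactly one hook, and the expression is valid only if that hook is in the
supported set. Roughly (userspace sketch, error code as in the kernel):

#include <errno.h>

static int chain_validate_hook(unsigned int hooknum, unsigned int supported)
{
	return (supported & (1u << hooknum)) ? 0 : -EOPNOTSUPP;
}
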
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
@@ -114,8 +135,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
struct nft_queue *priv = nft_expr_priv(expr);
int err;
- priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]);
- err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32));
+ err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM],
+ &priv->sreg_qnum, sizeof(u32));
if (err < 0)
return err;
@@ -130,7 +151,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
return 0;
}
-static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_queue_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -146,7 +168,8 @@ nla_put_failure:
}
static int
-nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr)
+nft_queue_sreg_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_queue *priv = nft_expr_priv(expr);
@@ -167,6 +190,8 @@ static const struct nft_expr_ops nft_queue_ops = {
.eval = nft_queue_eval,
.init = nft_queue_init,
.dump = nft_queue_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops nft_queue_sreg_ops = {
@@ -175,6 +200,8 @@ static const struct nft_expr_ops nft_queue_sreg_ops = {
.eval = nft_queue_sreg_eval,
.init = nft_queue_sreg_init,
.dump = nft_queue_sreg_dump,
+ .validate = nft_queue_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nft_expr_ops *
@@ -219,3 +246,4 @@ module_exit(nft_queue_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Leblond <eric@regit.org>");
MODULE_ALIAS_NFT_EXPR("queue");
+MODULE_DESCRIPTION("Netfilter nftables queue module");
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 0ed124a93fcf..df0798da2329 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -16,15 +13,22 @@
#include <net/netfilter/nf_tables.h>
struct nft_quota {
- u64 quota;
+ atomic64_t quota;
unsigned long flags;
- atomic64_t consumed;
+ atomic64_t *consumed;
};
static inline bool nft_overquota(struct nft_quota *priv,
- const struct sk_buff *skb)
+ const struct sk_buff *skb,
+ bool *report)
{
- return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
+ u64 consumed = atomic64_add_return(skb->len, priv->consumed);
+ u64 quota = atomic64_read(&priv->quota);
+
+ if (report)
+ *report = consumed >= quota;
+
+ return consumed > quota;
}
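
Note the asymmetry above: the depletion report fires when consumed reaches
the quota (>=), while packets only start matching once consumed strictly
exceeds it (>), so the byte that lands exactly on the boundary is still
accepted. A C11-atomics model of the same accounting (illustrative, not the
kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool overquota(atomic_uint_least64_t *consumed, uint64_t quota,
		      uint64_t pktlen, bool *report)
{
	uint64_t now = atomic_fetch_add(consumed, pktlen) + pktlen;

	if (report)
		*report = now >= quota;	/* notify userspace once */
	return now > quota;		/* but keep matching strict */
}
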
static inline bool nft_quota_invert(struct nft_quota *priv)
@@ -36,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
+ if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
}
@@ -53,16 +57,16 @@ static void nft_quota_obj_eval(struct nft_object *obj,
const struct nft_pktinfo *pkt)
{
struct nft_quota *priv = nft_obj_data(obj);
- bool overquota;
+ bool overquota, report;
- overquota = nft_overquota(priv, pkt->skb);
+ overquota = nft_overquota(priv, pkt->skb, &report);
if (overquota ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
- if (overquota &&
+ if (report &&
!test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
- nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
- NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
+ nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0,
+ NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC);
}
static int nft_quota_do_init(const struct nlattr * const tb[],
@@ -92,13 +96,23 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
return -EOPNOTSUPP;
}
- priv->quota = quota;
+ priv->consumed = kmalloc(sizeof(*priv->consumed), GFP_KERNEL_ACCOUNT);
+ if (!priv->consumed)
+ return -ENOMEM;
+
+ atomic64_set(&priv->quota, quota);
priv->flags = flags;
- atomic64_set(&priv->consumed, consumed);
+ atomic64_set(priv->consumed, consumed);
return 0;
}
+static void nft_quota_do_destroy(const struct nft_ctx *ctx,
+ struct nft_quota *priv)
+{
+ kfree(priv->consumed);
+}
+
static int nft_quota_obj_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[],
struct nft_object *obj)
@@ -108,25 +122,38 @@ static int nft_quota_obj_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
+static void nft_quota_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_quota *newpriv = nft_obj_data(newobj);
+ struct nft_quota *priv = nft_obj_data(obj);
+ u64 newquota;
+
+ newquota = atomic64_read(&newpriv->quota);
+ atomic64_set(&priv->quota, newquota);
+ priv->flags = newpriv->flags;
+}
+
static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
bool reset)
{
- u64 consumed, consumed_cap;
+ u64 consumed, consumed_cap, quota;
u32 flags = priv->flags;
	/* Since we unconditionally increment consumed quota for each packet
* that we see, don't go over the quota boundary in what we send to
* userspace.
*/
- consumed = atomic64_read(&priv->consumed);
- if (consumed >= priv->quota) {
- consumed_cap = priv->quota;
+ consumed = atomic64_read(priv->consumed);
+ quota = atomic64_read(&priv->quota);
+ if (consumed >= quota) {
+ consumed_cap = quota;
flags |= NFT_QUOTA_F_DEPLETED;
} else {
consumed_cap = consumed;
}
- if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
+ if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(quota),
NFTA_QUOTA_PAD) ||
nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed_cap),
NFTA_QUOTA_PAD) ||
@@ -134,7 +161,7 @@ static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
goto nla_put_failure;
if (reset) {
- atomic64_sub(consumed, &priv->consumed);
+ atomic64_sub(consumed, priv->consumed);
clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags);
}
return 0;
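
The reset path subtracts the value it just reported instead of storing zero,
so increments that race in between the read and the reset are preserved. The
same idea in C11 atomics (sketch):

#include <stdatomic.h>
#include <stdint.h>

static void quota_reset(atomic_uint_least64_t *consumed)
{
	uint64_t seen = atomic_load(consumed);

	/* atomic_store(consumed, 0) here would discard concurrent
	 * atomic_fetch_add()s that landed after the load above */
	atomic_fetch_sub(consumed, seen);
}
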
@@ -151,13 +178,23 @@ static int nft_quota_obj_dump(struct sk_buff *skb, struct nft_object *obj,
return nft_quota_do_dump(skb, priv, reset);
}
+static void nft_quota_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_quota *priv = nft_obj_data(obj);
+
+ return nft_quota_do_destroy(ctx, priv);
+}
+
static struct nft_object_type nft_quota_obj_type;
static const struct nft_object_ops nft_quota_obj_ops = {
.type = &nft_quota_obj_type,
.size = sizeof(struct nft_quota),
.init = nft_quota_obj_init,
+ .destroy = nft_quota_obj_destroy,
.eval = nft_quota_obj_eval,
.dump = nft_quota_obj_dump,
+ .update = nft_quota_obj_update,
};
static struct nft_object_type nft_quota_obj_type __read_mostly = {
@@ -186,11 +223,37 @@ static int nft_quota_init(const struct nft_ctx *ctx,
return nft_quota_do_init(tb, priv);
}
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_quota *priv = nft_expr_priv(expr);
+
+ return nft_quota_do_dump(skb, priv, reset);
+}
+
+static void nft_quota_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
struct nft_quota *priv = nft_expr_priv(expr);
- return nft_quota_do_dump(skb, priv, false);
+ return nft_quota_do_destroy(ctx, priv);
+}
+
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
+{
+ struct nft_quota *priv_dst = nft_expr_priv(dst);
+ struct nft_quota *priv_src = nft_expr_priv(src);
+
+ priv_dst->quota = priv_src->quota;
+ priv_dst->flags = priv_src->flags;
+
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp);
+ if (!priv_dst->consumed)
+ return -ENOMEM;
+
+ *priv_dst->consumed = *priv_src->consumed;
+
+ return 0;
}
static struct nft_expr_type nft_quota_type;
@@ -199,7 +262,10 @@ static const struct nft_expr_ops nft_quota_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_quota)),
.eval = nft_quota_eval,
.init = nft_quota_init,
+ .destroy = nft_quota_destroy,
+ .clone = nft_quota_clone,
.dump = nft_quota_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_quota_type __read_mostly = {
@@ -242,3 +308,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("quota");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
+MODULE_DESCRIPTION("Netfilter nftables quota module");
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index cedb96c3619f..ea382f7bbd78 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -18,14 +15,13 @@
struct nft_range_expr {
struct nft_data data_from;
struct nft_data data_to;
- enum nft_registers sreg:8;
+ u8 sreg;
u8 len;
enum nft_range_ops op:8;
};
-static void nft_range_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_range_eval(const struct nft_expr *expr,
+ struct nft_regs *regs, const struct nft_pktinfo *pkt)
{
const struct nft_range_expr *priv = nft_expr_priv(expr);
int d1, d2;
@@ -46,7 +42,7 @@ static void nft_range_eval(const struct nft_expr *expr,
static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = {
[NFTA_RANGE_SREG] = { .type = NLA_U32 },
- [NFTA_RANGE_OP] = { .type = NLA_U32 },
+ [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED },
[NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED },
};
@@ -55,7 +51,14 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
const struct nlattr * const tb[])
{
struct nft_range_expr *priv = nft_expr_priv(expr);
- struct nft_data_desc desc_from, desc_to;
+ struct nft_data_desc desc_from = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_from),
+ };
+ struct nft_data_desc desc_to = {
+ .type = NFT_DATA_VALUE,
+ .size = sizeof(priv->data_to),
+ };
int err;
u32 op;
@@ -65,13 +68,13 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
!tb[NFTA_RANGE_TO_DATA])
return -EINVAL;
- err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
- &desc_from, tb[NFTA_RANGE_FROM_DATA]);
+ err = nft_data_init(NULL, &priv->data_from, &desc_from,
+ tb[NFTA_RANGE_FROM_DATA]);
if (err < 0)
return err;
- err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to),
- &desc_to, tb[NFTA_RANGE_TO_DATA]);
+ err = nft_data_init(NULL, &priv->data_to, &desc_to,
+ tb[NFTA_RANGE_TO_DATA]);
if (err < 0)
goto err1;
@@ -80,8 +83,8 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
goto err2;
}
- priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]);
- err = nft_validate_register_load(priv->sreg, desc_from.len);
+ err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg,
+ desc_from.len);
if (err < 0)
goto err2;
@@ -108,7 +111,8 @@ err1:
return err;
}
-static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_range_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_range_expr *priv = nft_expr_priv(expr);
@@ -134,6 +138,7 @@ static const struct nft_expr_ops nft_range_ops = {
.eval = nft_range_eval,
.init = nft_range_init,
.dump = nft_range_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_range_type __read_mostly = {
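
For context, the range match evaluated by nft_range_eval() reduces to two
lexicographic comparisons on the key material; memcmp() ordering works here
precisely because nft stores data most-significant byte first. A userspace
sketch of the predicate:

#include <stdbool.h>
#include <string.h>

enum range_op { RANGE_EQ, RANGE_NEQ };

static bool range_match(const void *key, const void *from, const void *to,
			unsigned int len, enum range_op op)
{
	int d1 = memcmp(key, from, len);	/* key vs lower bound */
	int d2 = memcmp(key, to, len);		/* key vs upper bound */
	bool in_range = d1 >= 0 && d2 <= 0;

	return op == RANGE_EQ ? in_range : !in_range;
}
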
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index c64cbe78dee7..95eedad85c83 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -13,19 +10,24 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_redirect.h>
#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_redir.h>
-const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
+struct nft_redir {
+ u8 sreg_proto_min;
+ u8 sreg_proto_max;
+ u16 flags;
+};
+
+static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
[NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 },
[NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 },
- [NFTA_REDIR_FLAGS] = { .type = NLA_U32 },
+ [NFTA_REDIR_FLAGS] =
+ NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK),
};
-EXPORT_SYMBOL_GPL(nft_redir_policy);
-int nft_redir_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_redir_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
int err;
@@ -37,49 +39,43 @@ int nft_redir_validate(const struct nft_ctx *ctx,
(1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_OUT));
}
-EXPORT_SYMBOL_GPL(nft_redir_validate);
-int nft_redir_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_redir_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
{
struct nft_redir *priv = nft_expr_priv(expr);
unsigned int plen;
int err;
- plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
+ plen = sizeof_field(struct nf_nat_range, min_proto.all);
if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
- priv->sreg_proto_min =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MIN]);
-
- err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN],
+ &priv->sreg_proto_min, plen);
if (err < 0)
return err;
if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
- priv->sreg_proto_max =
- nft_parse_register(tb[NFTA_REDIR_REG_PROTO_MAX]);
-
- err = nft_validate_register_load(priv->sreg_proto_max,
- plen);
+ err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX],
+ &priv->sreg_proto_max,
+ plen);
if (err < 0)
return err;
} else {
priv->sreg_proto_max = priv->sreg_proto_min;
}
+
+ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
}
- if (tb[NFTA_REDIR_FLAGS]) {
+ if (tb[NFTA_REDIR_FLAGS])
priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
- }
return nf_ct_netns_get(ctx->net, ctx->family);
}
-EXPORT_SYMBOL_GPL(nft_redir_init);
-int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_redir_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_redir *priv = nft_expr_priv(expr);
@@ -101,7 +97,174 @@ int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
nla_put_failure:
return -1;
}
-EXPORT_SYMBOL_GPL(nft_redir_dump);
+
+static void nft_redir_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_redir *priv = nft_expr_priv(expr);
+ struct nf_nat_range2 range;
+
+ memset(&range, 0, sizeof(range));
+ range.flags = priv->flags;
+ if (priv->sreg_proto_min) {
+ range.min_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+ range.max_proto.all = (__force __be16)
+ nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+ }
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#ifdef CONFIG_NF_TABLES_IPV6
+ case NFPROTO_IPV6:
+ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+ nft_hook(pkt));
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
+static struct nft_expr_type nft_redir_ipv4_type;
+static const struct nft_expr_ops nft_redir_ipv4_ops = {
+ .type = &nft_redir_ipv4_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_eval,
+ .init = nft_redir_init,
+ .destroy = nft_redir_ipv4_destroy,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
+ .family = NFPROTO_IPV4,
+ .name = "redir",
+ .ops = &nft_redir_ipv4_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static void
+nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
+static struct nft_expr_type nft_redir_ipv6_type;
+static const struct nft_expr_ops nft_redir_ipv6_ops = {
+ .type = &nft_redir_ipv6_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_eval,
+ .init = nft_redir_init,
+ .destroy = nft_redir_ipv6_destroy,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
+ .family = NFPROTO_IPV6,
+ .name = "redir",
+ .ops = &nft_redir_ipv6_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+#endif
+
+#ifdef CONFIG_NF_TABLES_INET
+static void
+nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+ nf_ct_netns_put(ctx->net, NFPROTO_INET);
+}
+
+static struct nft_expr_type nft_redir_inet_type;
+static const struct nft_expr_ops nft_redir_inet_ops = {
+ .type = &nft_redir_inet_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+ .eval = nft_redir_eval,
+ .init = nft_redir_init,
+ .destroy = nft_redir_inet_destroy,
+ .dump = nft_redir_dump,
+ .validate = nft_redir_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_redir_inet_type __read_mostly = {
+ .family = NFPROTO_INET,
+ .name = "redir",
+ .ops = &nft_redir_inet_ops,
+ .policy = nft_redir_policy,
+ .maxattr = NFTA_REDIR_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_redir_module_init_inet(void)
+{
+ return nft_register_expr(&nft_redir_inet_type);
+}
+#else
+static inline int nft_redir_module_init_inet(void) { return 0; }
+#endif
+
+static int __init nft_redir_module_init(void)
+{
+ int ret = nft_register_expr(&nft_redir_ipv4_type);
+
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_NF_TABLES_IPV6
+ ret = nft_register_expr(&nft_redir_ipv6_type);
+ if (ret) {
+ nft_unregister_expr(&nft_redir_ipv4_type);
+ return ret;
+ }
+#endif
+
+ ret = nft_redir_module_init_inet();
+ if (ret < 0) {
+ nft_unregister_expr(&nft_redir_ipv4_type);
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_expr(&nft_redir_ipv6_type);
+#endif
+ return ret;
+ }
+
+ return ret;
+}
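
The init function above follows the usual register-and-unwind idiom: every
successfully registered expression type must be unregistered, in reverse
order, if a later registration fails. The generic shape, with hypothetical
stub names standing in for the real registrations:

#include <errno.h>

static int register_a(void) { return 0; }	/* hypothetical stubs */
static int register_b(void) { return 0; }
static int register_c(void) { return -ENOMEM; }	/* simulate a failure */
static void unregister_a(void) { }
static void unregister_b(void) { }

static int register_all(void)
{
	int err;

	err = register_a();
	if (err)
		return err;

	err = register_b();
	if (err)
		goto err_a;

	err = register_c();
	if (err)
		goto err_b;

	return 0;

err_b:
	unregister_b();
err_a:
	unregister_a();
	return err;
}
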
+
+static void __exit nft_redir_module_exit(void)
+{
+ nft_unregister_expr(&nft_redir_ipv4_type);
+#ifdef CONFIG_NF_TABLES_IPV6
+ nft_unregister_expr(&nft_redir_ipv6_type);
+#endif
+#ifdef CONFIG_NF_TABLES_INET
+ nft_unregister_expr(&nft_redir_inet_type);
+#endif
+}
+
+module_init(nft_redir_module_init);
+module_exit(nft_redir_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
+MODULE_ALIAS_NFT_EXPR("redir");
+MODULE_DESCRIPTION("Netfilter nftables redirect support");
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index b48e58cceeb7..196a92c7ea09 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2013 Eric Leblond <eric@regit.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -21,19 +18,19 @@
#include <linux/icmpv6.h>
const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
- [NFTA_REJECT_TYPE] = { .type = NLA_U32 },
+ [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 },
};
EXPORT_SYMBOL_GPL(nft_reject_policy);
int nft_reject_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain,
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_FORWARD) |
- (1 << NF_INET_LOCAL_OUT));
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING));
}
EXPORT_SYMBOL_GPL(nft_reject_validate);
@@ -42,6 +39,7 @@ int nft_reject_init(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
{
struct nft_reject *priv = nft_expr_priv(expr);
+ int icmp_code;
if (tb[NFTA_REJECT_TYPE] == NULL)
return -EINVAL;
@@ -49,9 +47,17 @@ int nft_reject_init(const struct nft_ctx *ctx,
priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
return -EINVAL;
- priv->icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+
+ icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
+ if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
+ icmp_code > NFT_REJECT_ICMPX_MAX)
+ return -EINVAL;
+
+ priv->icmp_code = icmp_code;
+ break;
case NFT_REJECT_TCP_RST:
break;
default:
@@ -62,7 +68,8 @@ int nft_reject_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_reject_init);
-int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
+int nft_reject_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
const struct nft_reject *priv = nft_expr_priv(expr);
@@ -71,6 +78,7 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
+ case NFT_REJECT_ICMPX_UNREACH:
if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
goto nla_put_failure;
break;
@@ -122,3 +130,4 @@ EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 5a7fb5ff867d..49020e67304a 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2014 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -31,7 +28,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach(pkt->skb,
@@ -47,7 +45,8 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt),
+ pkt->skb, nft_hook(pkt));
break;
case NFT_REJECT_ICMPX_UNREACH:
nf_send_unreach6(nft_net(pkt), pkt->skb,
@@ -61,60 +60,15 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
regs->verdict.code = NF_DROP;
}
-static int nft_reject_inet_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static int nft_reject_inet_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_inet_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_INGRESS));
}
static struct nft_expr_type nft_reject_inet_type;
@@ -122,9 +76,10 @@ static const struct nft_expr_ops nft_reject_inet_ops = {
.type = &nft_reject_inet_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_inet_eval,
- .init = nft_reject_inet_init,
- .dump = nft_reject_inet_dump,
- .validate = nft_reject_validate,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_inet_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_inet_type __read_mostly = {
@@ -152,3 +107,4 @@ module_exit(nft_reject_inet_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(1, "reject");
+MODULE_DESCRIPTION("Netfilter nftables reject inet support");
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
new file mode 100644
index 000000000000..2558ce1505d9
--- /dev/null
+++ b/net/netfilter/nft_reject_netdev.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020 Laura Garcia Liebana <nevola@gmail.com>
+ * Copyright (c) 2020 Jose M. Guisado <guigom@riseup.net>
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_reject.h>
+#include <net/netfilter/ipv4/nf_reject.h>
+#include <net/netfilter/ipv6/nf_reject.h>
+
+static void nft_reject_queue_xmit(struct sk_buff *nskb, struct sk_buff *oldskb)
+{
+ dev_hard_header(nskb, nskb->dev, ntohs(oldskb->protocol),
+ eth_hdr(oldskb)->h_source, eth_hdr(oldskb)->h_dest,
+ nskb->len);
+ dev_queue_xmit(nskb);
+}
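
Note the argument order in the dev_hard_header() call: the original frame's
source becomes the reply's destination and vice versa, so the reject frame
goes straight back to the sender. Modelled in userspace (illustrative
struct):

#include <stdint.h>
#include <string.h>

struct eth_model {
	uint8_t  dest[6];
	uint8_t  source[6];
	uint16_t proto;
};

static void fill_reply_eth(struct eth_model *reply,
			   const struct eth_model *orig)
{
	memcpy(reply->dest, orig->source, 6);	/* back to the sender */
	memcpy(reply->source, orig->dest, 6);
	reply->proto = orig->proto;
}
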
+
+static void nft_reject_netdev_send_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, dev, hook);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_send_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, dev, hook, code);
+ if (!nskb)
+ return;
+
+ nft_reject_queue_xmit(nskb, oldskb);
+}
+
+static void nft_reject_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct ethhdr *eth = eth_hdr(pkt->skb);
+ struct nft_reject *priv = nft_expr_priv(expr);
+ const unsigned char *dest = eth->h_dest;
+
+ if (is_broadcast_ether_addr(dest) ||
+ is_multicast_ether_addr(dest))
+ goto out;
+
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v4_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v4_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmp_code(priv->icmp_code));
+ break;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ switch (priv->type) {
+ case NFT_REJECT_ICMP_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ priv->icmp_code);
+ break;
+ case NFT_REJECT_TCP_RST:
+ nft_reject_netdev_send_v6_tcp_reset(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt));
+ break;
+ case NFT_REJECT_ICMPX_UNREACH:
+ nft_reject_netdev_send_v6_unreach(nft_net(pkt), pkt->skb,
+ nft_in(pkt),
+ nft_hook(pkt),
+ nft_reject_icmpv6_code(priv->icmp_code));
+ break;
+ }
+ break;
+ default:
+ /* No explicit way to reject this protocol, drop it. */
+ break;
+ }
+out:
+ regs->verdict.code = NF_DROP;
+}
+
+static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
+static struct nft_expr_type nft_reject_netdev_type;
+static const struct nft_expr_ops nft_reject_netdev_ops = {
+ .type = &nft_reject_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
+ .eval = nft_reject_netdev_eval,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
+ .validate = nft_reject_netdev_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_reject_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "reject",
+ .ops = &nft_reject_netdev_ops,
+ .policy = nft_reject_policy,
+ .maxattr = NFTA_REJECT_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_reject_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_reject_netdev_type);
+}
+
+static void __exit nft_reject_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_reject_netdev_type);
+}
+
+module_init(nft_reject_netdev_module_init);
+module_exit(nft_reject_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laura Garcia Liebana <nevola@gmail.com>");
+MODULE_AUTHOR("Jose M. Guisado <guigom@riseup.net>");
+MODULE_DESCRIPTION("Reject packets from netdev via nftables");
+MODULE_ALIAS_NFT_AF_EXPR(5, "reject");
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index f35fa33913ae..dc50b9a5bd68 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016 Anders K. Pedersen <akp@cohaesio.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -18,7 +15,7 @@
struct nft_rt {
enum nft_rt_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
};
static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst)
@@ -53,9 +50,9 @@ static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skb
return mtu - minlen;
}
-static void nft_rt_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_rt_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_rt *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
@@ -76,14 +73,14 @@ static void nft_rt_get_eval(const struct nft_expr *expr,
if (nft_pf(pkt) != NFPROTO_IPV4)
goto err;
- *dest = (__force u32)rt_nexthop((const struct rtable *)dst,
+ *dest = (__force u32)rt_nexthop(dst_rtable(dst),
ip_hdr(skb)->daddr);
break;
case NFT_RT_NEXTHOP6:
if (nft_pf(pkt) != NFPROTO_IPV6)
goto err;
- memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+ memcpy(dest, rt6_nexthop(dst_rt6_info(dst),
&ipv6_hdr(skb)->daddr),
sizeof(struct in6_addr));
break;
@@ -107,7 +104,7 @@ err:
static const struct nla_policy nft_rt_policy[NFTA_RT_MAX + 1] = {
[NFTA_RT_DREG] = { .type = NLA_U32 },
- [NFTA_RT_KEY] = { .type = NLA_U32 },
+ [NFTA_RT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_rt_get_init(const struct nft_ctx *ctx,
@@ -144,13 +141,12 @@ static int nft_rt_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_RT_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ return nft_parse_register_store(ctx, tb[NFTA_RT_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_rt_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_rt *priv = nft_expr_priv(expr);
@@ -164,12 +160,16 @@ nla_put_failure:
return -1;
}
-static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_rt *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->key) {
case NFT_RT_NEXTHOP4:
case NFT_RT_NEXTHOP6:
@@ -195,6 +195,7 @@ static const struct nft_expr_ops nft_rt_get_ops = {
.init = nft_rt_get_init,
.dump = nft_rt_get_dump,
.validate = nft_rt_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
struct nft_expr_type nft_rt_type __read_mostly = {
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index f866bd41e5d2..8d3f040a904a 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -13,9 +10,10 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
struct nft_bitmap_elem {
+ struct nft_elem_priv priv;
struct list_head head;
struct nft_set_ext ext;
};
@@ -24,7 +22,7 @@ struct nft_bitmap_elem {
* the element state in the current and the future generation.
*
* An element can be in three states. The generation cursor is represented using
- * the ^ character, note that this cursor shifts on every succesful transaction.
+ * the ^ character, note that this cursor shifts on every successful transaction.
* If no transaction is going on, we observe all elements are in the following
* state:
*
@@ -42,7 +40,7 @@ struct nft_bitmap_elem {
* 10 = this element is active in the current generation and it becomes inactive
* ^ in the next one. This happens when the element is deactivated but commit
 * path has not been executed yet, so removal is still pending. On
- * transation abortion, the next generation bit is reset to go back to
+ * transaction abortion, the next generation bit is reset to go back to
* restore its previous state.
*/
struct nft_bitmap {
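
The two-bit scheme in the comment above can be exercised in a few lines of
plain C; this is a simplified model (one element, activation collapsed into
insertion), not the set backend itself:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t bitmap = 0;	/* one element: two adjacent bits */
	unsigned int gen = 0;	/* generation cursor */

	bitmap |= 0x3;			/* insert+activate: state 11 */
	printf("active: %d\n", !!(bitmap & (1u << gen)));

	bitmap &= ~(1u << !gen);	/* deactivate: state 10, pending */
	gen = !gen;			/* commit: the cursor shifts */
	printf("active after commit: %d\n", !!(bitmap & (1u << gen)));
	return 0;
}
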
@@ -76,26 +74,34 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}
-static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
const struct nft_bitmap *priv = nft_set_priv(set);
+ static const struct nft_set_ext found;
u8 genmask = nft_genmask_cur(net);
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
- return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+ if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+ return &found;
+
+ return NULL;
}
static struct nft_bitmap_elem *
-nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
+nft_bitmap_elem_find(const struct net *net,
+ const struct nft_set *set, struct nft_bitmap_elem *this,
u8 genmask)
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(net)->commit_mutex)) {
if (memcmp(nft_set_ext_key(&be->ext),
nft_set_ext_key(&this->ext), set->klen) ||
!nft_set_elem_active(&be->ext, genmask))
@@ -106,8 +112,9 @@ nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this,
return NULL;
}
-static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_bitmap_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
const struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -118,23 +125,23 @@ static void *nft_bitmap_get(const struct net *net, const struct nft_set *set,
!nft_set_elem_active(&be->ext, genmask))
continue;
- return be;
+ return &be->priv;
}
return ERR_PTR(-ENOENT);
}
static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *new = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
- be = nft_bitmap_elem_find(set, new, genmask);
+ be = nft_bitmap_elem_find(net, set, new, genmask);
if (be) {
- *ext = &be->ext;
+ *elem_priv = &be->priv;
return -EEXIST;
}
@@ -146,12 +153,11 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
return 0;
}
-static void nft_bitmap_remove(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static void nft_bitmap_remove(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
@@ -163,47 +169,46 @@ static void nft_bitmap_remove(const struct net *net,
static void nft_bitmap_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *be = elem->priv;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 11 state. */
priv->bitmap[idx] |= (genmask << off);
- nft_set_elem_change_active(net, set, &be->ext);
+ nft_clear(net, &be->ext);
}
-static bool nft_bitmap_flush(const struct net *net,
- const struct nft_set *set, void *_be)
+static void nft_bitmap_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
struct nft_bitmap *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
- struct nft_bitmap_elem *be = _be;
u32 idx, off;
nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
/* Enter 10 state, similar to deactivation. */
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
-
- return true;
}
-static void *nft_bitmap_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
struct nft_bitmap *priv = nft_set_priv(set);
- struct nft_bitmap_elem *this = elem->priv, *be;
u8 genmask = nft_genmask_next(net);
u32 idx, off;
nft_bitmap_location(set, elem->key.val.data, &idx, &off);
- be = nft_bitmap_elem_find(set, this, genmask);
+ be = nft_bitmap_elem_find(net, set, this, genmask);
if (!be)
return NULL;
@@ -211,7 +216,7 @@ static void *nft_bitmap_deactivate(const struct net *net,
priv->bitmap[idx] &= ~(genmask << off);
nft_set_elem_change_active(net, set, &be->ext);
- return be;
+ return &be->priv;
}
static void nft_bitmap_walk(const struct nft_ctx *ctx,
@@ -220,17 +225,13 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
{
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- struct nft_set_elem elem;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&be->ext, iter->genmask))
- goto cont;
- elem.priv = be;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &be->priv);
if (iter->err < 0)
return;
@@ -262,24 +263,27 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
}
static int nft_bitmap_init(const struct nft_set *set,
- const struct nft_set_desc *desc,
- const struct nlattr * const nla[])
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
{
struct nft_bitmap *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);
+
INIT_LIST_HEAD(&priv->list);
priv->bitmap_size = nft_bitmap_size(set->klen);
return 0;
}
-static void nft_bitmap_destroy(const struct nft_set *set)
+static void nft_bitmap_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be, *n;
list_for_each_entry_safe(be, n, &priv->list, head)
- nft_set_elem_destroy(set, be, true);
+ nf_tables_set_elem_destroy(ctx, set, &be->priv);
}
static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
@@ -288,6 +292,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
	/* Make sure we don't get bitmaps larger than 16 Kbytes. */
if (desc->klen > 2)
return false;
+ else if (desc->expr)
+ return false;
est->size = nft_bitmap_total_size(desc->klen);
est->lookup = NFT_SET_CLASS_O_1;
@@ -296,8 +302,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_bitmap_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_bitmap_type = {
.ops = {
.privsize = nft_bitmap_privsize,
.elemsize = offsetof(struct nft_bitmap_elem, ext),
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 339a9dd1c832..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -19,7 +16,7 @@
#include <linux/rhashtable.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
/* We target a hash table size of 4, element hint is 75% of final size */
#define NFT_RHASH_ELEMENT_HINT 3
@@ -27,10 +24,14 @@
struct nft_rhash {
struct rhashtable ht;
struct delayed_work gc_work;
+ u32 wq_gc_seq;
};
struct nft_rhash_elem {
+ struct nft_elem_priv priv;
struct rhash_head node;
+ struct llist_node walk_node;
+ u32 wq_gc_seq;
struct nft_set_ext ext;
};
@@ -38,6 +39,7 @@ struct nft_rhash_cmp_arg {
const struct nft_set *set;
const u32 *key;
u8 genmask;
+ u64 tstamp;
};
static inline u32 nft_rhash_key(const void *data, u32 len, u32 seed)
@@ -62,7 +64,9 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
- if (nft_set_elem_expired(&he->ext))
+ if (nft_set_elem_is_dead(&he->ext))
+ return 1;
+ if (__nft_set_elem_expired(&he->ext, x->tstamp))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
return 1;
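
rhashtable compare callbacks return 0 for a match and nonzero for "keep
looking", so the checks above make dead, expired, or generation-inactive
entries invisible to lookups without removing them. The shape of such a
callback (userspace sketch, illustrative fields):

#include <stdbool.h>
#include <string.h>

struct elem {
	unsigned char key[16];
	bool dead;
	unsigned long long expires;
};

static int elem_cmp(const struct elem *e, const unsigned char *key,
		    unsigned int klen, unsigned long long now)
{
	if (memcmp(e->key, key, klen))
		return 1;
	if (e->dead || now >= e->expires)
		return 1;	/* treat stale entries as absent */
	return 0;
}
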
@@ -77,8 +81,10 @@ static const struct rhashtable_params nft_rhash_params = {
.automatic_shrinking = true,
};
-static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rhash *priv = nft_set_priv(set);
const struct nft_rhash_elem *he;
@@ -86,17 +92,19 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- *ext = &he->ext;
+ return &he->ext;
- return !!he;
+ return NULL;
}
-static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rhash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -104,39 +112,40 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set,
.genmask = nft_genmask_cur(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- return he;
+ return &he->priv;
return ERR_PTR(-ENOENT);
}
-static bool nft_rhash_update(struct nft_set *set, const u32 *key,
- void *(*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *regs),
- const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+nft_rhash_update(struct nft_set *set, const u32 *key,
+ const struct nft_expr *expr, struct nft_regs *regs)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he, *prev;
+ struct nft_elem_priv *elem_priv;
struct nft_rhash_cmp_arg arg = {
.genmask = NFT_GENMASK_ANY,
.set = set,
.key = key,
+ .tstamp = get_jiffies_64(),
};
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
goto out;
- he = new(set, expr, regs);
- if (he == NULL)
+ elem_priv = nft_dynset_new(set, expr, regs);
+ if (!elem_priv)
goto err1;
+ he = nft_elem_priv_cast(elem_priv);
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -144,69 +153,67 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
/* Another cpu may race to insert the element with the same key */
if (prev) {
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
+ atomic_dec(&set->nelems);
he = prev;
}
out:
- *ext = &he->ext;
- return true;
+ return &he->ext;
err2:
- nft_set_elem_destroy(set, he, true);
+ nft_set_elem_destroy(set, &he->priv, true);
+ atomic_dec(&set->nelems);
err1:
- return false;
+ return NULL;
}
static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem->priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
struct nft_rhash_cmp_arg arg = {
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
struct nft_rhash_elem *prev;
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
return PTR_ERR(prev);
if (prev) {
- *ext = &prev->ext;
+ *elem_priv = &prev->priv;
return -EEXIST;
}
return 0;
}
static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = elem->priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_rhash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rhash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rhash_elem *he = priv;
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext)) {
- nft_set_elem_change_active(net, set, &he->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &he->ext);
}
-static void *nft_rhash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rhash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he;
@@ -214,36 +221,55 @@ static void *nft_rhash_deactivate(const struct net *net,
.genmask = nft_genmask_next(net),
.set = set,
.key = elem->key.val.data,
+ .tstamp = nft_net_tstamp(net),
};
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
- if (he != NULL &&
- !nft_rhash_flush(net, set, he))
- he = NULL;
+ if (he)
+ nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
- return he;
+ return &he->priv;
}
static void nft_rhash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rhash_elem *he = nft_elem_priv_cast(elem_priv);
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he = elem->priv;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
}
-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static bool nft_rhash_delete(const struct nft_set *set,
+ const u32 *key)
{
struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_cmp_arg arg = {
+ .genmask = NFT_GENMASK_ANY,
+ .set = set,
+ .key = key,
+ };
struct nft_rhash_elem *he;
+
+ he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
+ if (he == NULL)
+ return false;
+
+ nft_set_elem_dead(&he->ext);
+
+ return true;
+}
+
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
struct rhashtable_iter hti;
- struct nft_set_elem elem;
+ struct nft_rhash_elem *he;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
@@ -260,14 +286,8 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (iter->count < iter->skip)
goto cont;
- if (nft_set_elem_expired(&he->ext))
- goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
- elem.priv = he;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
break;
@@ -278,51 +298,199 @@ cont:
rhashtable_walk_exit(&hti);
}
+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_elem *he, *tmp;
+ struct llist_node *first_node;
+ struct rhashtable_iter hti;
+ LLIST_HEAD(walk_list);
+
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (set->in_update_walk) {
+ /* This can happen with bogus rulesets during ruleset validation
+ * when a verdict map causes a jump back to the same map.
+ *
+ * Without this extra check the walk_next loop below will see
+ * elems on the caller's walk_list and skip (not validate) them.
+ */
+ iter->err = -EMLINK;
+ return;
+ }
+
+ /* walk happens under RCU.
+ *
+ * We create a snapshot list so ->iter callback can sleep.
+ * commit_mutex is held, elements can ...
+ * .. be added in parallel from dataplane (dynset)
+ * .. be marked as dead in parallel from dataplane (dynset).
+ * .. be queued for removal in parallel (gc timeout).
+ * .. not be freed: transaction mutex is held.
+ */
+ rhashtable_walk_enter(&priv->ht, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
+ }
+
+ continue;
+ }
+
+ /* rhashtable resized during walk, skip */
+ if (llist_on_list(&he->walk_node))
+ continue;
+
+ llist_add(&he->walk_node, &walk_list);
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ first_node = __llist_del_all(&walk_list);
+ set->in_update_walk = true;
+ llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+ if (iter->err == 0) {
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
+ if (iter->err == 0)
+ iter->count++;
+ }
+
+ /* all entries must be cleared again, else next ->walk iteration
+ * will skip entries.
+ */
+ init_llist_node(&he->walk_node);
+ }
+ set->in_update_walk = false;
+}
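
The walk above runs in two phases: snapshot the elements under RCU, then iterate the private list where the callback may sleep. A miniature of that shape, with hypothetical names (snapshot(), struct node, seen) standing in for the kernel's llist machinery:

	#include <stddef.h>

	struct node {
		struct node *next;
		int seen;	/* plays the role of llist_on_list() */
	};

	/* Phase 1: collect entries onto a private list, skipping duplicates
	 * that an unstable walk may yield. Phase 2 (caller): iterate the
	 * returned list; entries cannot vanish while the transaction mutex
	 * analogue is held, so the iterator may sleep.
	 */
	static struct node *snapshot(struct node **table, size_t n)
	{
		struct node *head = NULL;
		size_t i;

		for (i = 0; i < n; i++) {
			struct node *e = table[i];

			if (!e || e->seen)
				continue;
			e->seen = 1;	/* must be cleared after the walk */
			e->next = head;
			head = e;
		}
		return head;
	}
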
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ /* only relevant for netlink dumps which use READ type */
+ WARN_ON_ONCE(iter->skip != 0);
+
+ nft_rhash_walk_update(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ nft_rhash_walk_ro(ctx, set, iter);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
+ struct nft_set_ext *ext)
+{
+ struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
+ struct nft_expr *expr;
+ u32 size;
+
+ nft_setelem_expr_foreach(expr, elem_expr, size) {
+ if (expr->ops->gc &&
+ expr->ops->gc(read_pnet(&set->net), expr) &&
+ set->flags & NFT_SET_EVAL)
+ return true;
+ }
+
+ return false;
+}
+
static void nft_rhash_gc(struct work_struct *work)
{
+ struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
- struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
+ struct nft_trans_gc *gc;
+ struct net *net;
+ u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ if (nft_set_gc_is_pending(set))
+ goto done;
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
+
+ /* Elements never collected use a zero gc worker sequence number. */
+ if (unlikely(++priv->wq_gc_seq == 0))
+ priv->wq_gc_seq++;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- break;
- continue;
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
}
- if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(&he->ext);
-
- if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
- goto gc;
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
}
- if (!nft_set_elem_expired(&he->ext))
- continue;
-gc:
- if (nft_set_elem_mark_busy(&he->ext))
+
+ /* rhashtable walk is unstable, already seen in this gc run?
+ * Then, skip this element. In case of (unlikely) sequence
+ * wraparound and stale element wq_gc_seq, next gc run will
+ * just find this expired element.
+ */
+ if (he->wq_gc_seq == priv->wq_gc_seq)
continue;
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- break;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
+ if (nft_set_elem_is_dead(&he->ext))
+ goto dead_elem;
+
+ if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
+ nft_rhash_expr_needs_gc_run(set, &he->ext))
+ goto needs_gc_run;
+
+ if (!nft_set_elem_expired(&he->ext))
+ continue;
+needs_gc_run:
+ nft_set_elem_dead(&he->ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
+
+ /* annotate gc sequence for this attempt. */
+ he->wq_gc_seq = priv->wq_gc_seq;
+ nft_trans_gc_elem_add(gc, he);
}
+
+ gc = nft_trans_gc_catchall_async(gc, gc_seq);
+
+try_later:
+ /* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
- nft_set_gc_batch_complete(gcb);
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
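
The wq_gc_seq tag exists because an rhashtable walk is unstable: a resize can make the walker visit the same element twice within one gc run. A minimal userspace sketch of the dedup rule, with hypothetical names (struct elem, gc_visit()):

	#include <stdbool.h>
	#include <stdint.h>

	struct elem {
		uint32_t wq_gc_seq;	/* last gc run that visited this element */
		bool expired;
	};

	/* Returns true if the element should be queued for collection in
	 * this run; duplicates produced by an unstable walk compare equal
	 * on the run sequence and are skipped.
	 */
	static bool gc_visit(struct elem *e, uint32_t run_seq)
	{
		if (e->wq_gc_seq == run_seq)
			return false;		/* already seen in this run */
		if (!e->expired)
			return false;
		e->wq_gc_seq = run_seq;		/* annotate this attempt */
		return true;
	}
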
@@ -349,6 +517,8 @@ static int nft_rhash_init(const struct nft_set *set,
struct rhashtable_params params = nft_rhash_params;
int err;
+ BUILD_BUG_ON(offsetof(struct nft_rhash_elem, priv) != 0);
+
params.nelem_hint = desc->size ?: NFT_RHASH_ELEMENT_HINT;
params.key_len = set->klen;
@@ -357,30 +527,50 @@ static int nft_rhash_init(const struct nft_set *set,
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
+ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
}
+struct nft_rhash_ctx {
+ const struct nft_ctx ctx;
+ const struct nft_set *set;
+};
+
static void nft_rhash_elem_destroy(void *ptr, void *arg)
{
- nft_set_elem_destroy(arg, ptr, true);
+ struct nft_rhash_ctx *rhash_ctx = arg;
+ struct nft_rhash_elem *he = ptr;
+
+ nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, &he->priv);
}
-static void nft_rhash_destroy(const struct nft_set *set)
+static void nft_rhash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_ctx rhash_ctx = {
+ .ctx = *ctx,
+ .set = set,
+ };
cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
- (void *)set);
+ (void *)&rhash_ctx);
}
+/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
+#define NFT_MAX_BUCKETS (1U << 31)
+
static u32 nft_hash_buckets(u32 size)
{
- return roundup_pow_of_two(size * 4 / 3);
+ u64 val = div_u64((u64)size * 4, 3);
+
+ if (val >= NFT_MAX_BUCKETS)
+ return NFT_MAX_BUCKETS;
+
+ return roundup_pow_of_two(val);
}
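
As a side note on the arithmetic: with the element hint at 75% of the final size, the bucket count is size * 4 / 3 rounded up to the next power of two, saturating at NFT_MAX_BUCKETS so the result fits the u32 bucket counter. A self-contained sketch, where roundup_p2() stands in for the kernel's roundup_pow_of_two():

	#include <stdint.h>
	#include <stdio.h>

	#define NFT_MAX_BUCKETS (1U << 31)

	static uint32_t roundup_p2(uint64_t v)	/* next power of two >= v */
	{
		uint32_t r = 1;

		while (r < v)
			r <<= 1;
		return r;
	}

	static uint32_t hash_buckets(uint32_t size)
	{
		uint64_t val = (uint64_t)size * 4 / 3;	/* widen first: no u32 overflow */

		if (val >= NFT_MAX_BUCKETS)		/* cap: result must fit in u32 */
			return NFT_MAX_BUCKETS;
		return roundup_p2(val);
	}

	int main(void)
	{
		printf("%u %u %u\n", hash_buckets(3),		/* 4 */
		       hash_buckets(1000),			/* 2048 */
		       hash_buckets(UINT32_MAX));		/* 2147483648, capped */
		return 0;
	}
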
static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features,
@@ -400,12 +590,15 @@ struct nft_hash {
};
struct nft_hash_elem {
+ struct nft_elem_priv priv;
struct hlist_node node;
struct nft_set_ext ext;
};
-static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -416,16 +609,15 @@ static bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
-static void *nft_hash_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_hash_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -437,59 +629,65 @@ static void *nft_hash_get(const struct net *net, const struct nft_set *set,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
nft_set_elem_active(&he->ext, genmask))
- return he;
+ return &he->priv;
}
return ERR_PTR(-ENOENT);
}
-/* nft_hash_select_ops() makes sure key size can be either 2 or 4 bytes . */
-static inline u32 nft_hash_key(const u32 *key, u32 klen)
-{
- if (klen == 4)
- return *key;
-
- return *(u16 *)key;
-}
-
-static bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
const struct nft_hash_elem *he;
u32 hash, k1, k2;
- k1 = nft_hash_key(key, set->klen);
+ k1 = *key;
hash = jhash_1word(k1, priv->seed);
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
- k2 = nft_hash_key(nft_set_ext_key(&he->ext)->data, set->klen);
+ k2 = *(u32 *)nft_set_ext_key(&he->ext)->data;
if (k1 == k2 &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
+}
+
+static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
+ const struct nft_set_ext *ext)
+{
+ const struct nft_data *key = nft_set_ext_key(ext);
+ u32 hash, k1;
+
+ if (set->klen == 4) {
+ k1 = *(u32 *)key;
+ hash = jhash_1word(k1, priv->seed);
+ } else {
+ hash = jhash(key, set->klen, priv->seed);
+ }
+ hash = reciprocal_scale(hash, priv->buckets);
+
+ return hash;
}
static int nft_hash_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
- struct nft_hash_elem *this = elem->priv, *he;
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
u32 hash;
- hash = jhash(nft_set_ext_key(&this->ext), set->klen, priv->seed);
- hash = reciprocal_scale(hash, priv->buckets);
+ hash = nft_jhash(set, priv, &this->ext);
hlist_for_each_entry(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&this->ext),
nft_set_ext_key(&he->ext), set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
+ *elem_priv = &he->priv;
return -EEXIST;
}
}
@@ -498,39 +696,38 @@ static int nft_hash_insert(const struct net *net, const struct nft_set *set,
}
static void nft_hash_activate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &he->ext);
+ nft_clear(net, &he->ext);
}
-static bool nft_hash_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_hash_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
nft_set_elem_change_active(net, set, &he->ext);
- return true;
}
-static void *nft_hash_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_hash_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_hash_elem *this = nft_elem_priv_cast(elem->priv), *he;
struct nft_hash *priv = nft_set_priv(set);
- struct nft_hash_elem *this = elem->priv, *he;
u8 genmask = nft_genmask_next(net);
u32 hash;
- hash = jhash(nft_set_ext_key(&this->ext), set->klen, priv->seed);
- hash = reciprocal_scale(hash, priv->buckets);
+ hash = nft_jhash(set, priv, &this->ext);
hlist_for_each_entry(he, &priv->table[hash], node) {
- if (!memcmp(nft_set_ext_key(&this->ext), &elem->key.val,
+ if (!memcmp(nft_set_ext_key(&he->ext), &elem->key.val,
set->klen) &&
nft_set_elem_active(&he->ext, genmask)) {
nft_set_elem_change_active(net, set, &he->ext);
- return he;
+ return &he->priv;
}
}
return NULL;
@@ -538,9 +735,9 @@ static void *nft_hash_deactivate(const struct net *net,
static void nft_hash_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_hash_elem *he = elem->priv;
+ struct nft_hash_elem *he = nft_elem_priv_cast(elem_priv);
hlist_del_rcu(&he->node);
}
@@ -550,19 +747,15 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
- struct nft_set_elem elem;
int i;
for (i = 0; i < priv->buckets; i++) {
- hlist_for_each_entry_rcu(he, &priv->table[i], node) {
+ hlist_for_each_entry_rcu(he, &priv->table[i], node,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&he->ext, iter->genmask))
- goto cont;
- elem.priv = he;
-
- iter->err = iter->fn(ctx, set, iter, &elem);
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
if (iter->err < 0)
return;
cont:
@@ -575,7 +768,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[],
const struct nft_set_desc *desc)
{
return sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head);
}
static int nft_hash_init(const struct nft_set *set,
@@ -590,7 +783,8 @@ static int nft_hash_init(const struct nft_set *set,
return 0;
}
-static void nft_hash_destroy(const struct nft_set *set)
+static void nft_hash_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
@@ -600,7 +794,7 @@ static void nft_hash_destroy(const struct nft_set *set)
for (i = 0; i < priv->buckets; i++) {
hlist_for_each_entry_safe(he, next, &priv->table[i], node) {
hlist_del_rcu(&he->node);
- nft_set_elem_destroy(set, he, true);
+ nf_tables_set_elem_destroy(ctx, set, &he->priv);
}
}
}
@@ -615,8 +809,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
@@ -624,7 +818,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
}
static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
- struct nft_set_estimate *est)
+ struct nft_set_estimate *est)
{
if (!desc->size)
return false;
@@ -633,16 +827,15 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return false;
est->size = sizeof(struct nft_hash) +
- nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
- desc->size * sizeof(struct nft_hash_elem);
+ (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) +
+ (u64)desc->size * sizeof(struct nft_hash_elem);
est->lookup = NFT_SET_CLASS_O_1;
est->space = NFT_SET_CLASS_O_N;
return true;
}
-struct nft_set_type nft_set_rhash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rhash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT | NFT_SET_EVAL,
.ops = {
@@ -659,13 +852,13 @@ struct nft_set_type nft_set_rhash_type __read_mostly = {
.remove = nft_rhash_remove,
.lookup = nft_rhash_lookup,
.update = nft_rhash_update,
+ .delete = nft_rhash_delete,
.walk = nft_rhash_walk,
.get = nft_rhash_get,
},
};
-struct nft_set_type nft_set_hash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
@@ -684,8 +877,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_fast_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_fast_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
new file mode 100644
index 000000000000..112fe46788b6
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.c
@@ -0,0 +1,2400 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+/**
+ * DOC: Theory of Operation
+ *
+ *
+ * Problem
+ * -------
+ *
+ * Match packet bytes against entries composed of ranged or non-ranged packet
+ * field specifiers, mapping them to arbitrary references. For example:
+ *
+ * ::
+ *
+ * --- fields --->
+ * | [net],[port],[net]... => [reference]
+ * entries [net],[port],[net]... => [reference]
+ * | [net],[port],[net]... => [reference]
+ * V ...
+ *
+ * where [net] fields can be IP ranges or netmasks, and [port] fields are port
+ * ranges. Arbitrary packet fields can be matched.
+ *
+ *
+ * Algorithm Overview
+ * ------------------
+ *
+ * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally
+ * relies on the consideration that every contiguous range in a space of b bits
+ * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010],
+ * as also illustrated in Section 9 of [Kogan 2014].
+ *
+ * Classification against a number of entries that require matching given bits
+ * of a packet field, is performed by grouping those bits in sets of arbitrary
+ * size, and classifying packet bits one group at a time.
+ *
+ * Example:
+ * to match the source port (16 bits) of a packet, we can divide those 16 bits
+ * in 4 groups of 4 bits each. Given the entry:
+ * 0000 0001 0101 1001
+ * and a packet with source port:
+ * 0000 0001 1010 1001
+ * first and second groups match, but the third doesn't. We conclude that the
+ * packet doesn't match the given entry.
+ *
+ * Translate the set to a sequence of lookup tables, one per field. Each table
+ * has two dimensions: bit groups to be matched for a single packet field, and
+ * all the possible values of said groups (buckets). Input entries are
+ * represented as one or more rules, depending on the number of composing
+ * netmasks for the given field specifier, and a group match is indicated as a
+ * set bit, with number corresponding to the rule index, in all the buckets
+ * whose value matches the entry for a given group.
+ *
+ * Rules are mapped between fields through an array of x, n pairs, with each
+ * item mapping a matched rule to one or more rules. The position of the pair in
+ * the array indicates the matched rule to be mapped to the next field, x
+ * indicates the first rule index in the next field, and n the amount of
+ * next-field rules the current rule maps to.
+ *
+ * The mapping array for the last field maps to the desired references.
+ *
+ * To match, we perform table lookups using the values of grouped packet bits,
+ * and use a sequence of bitwise operations to progressively evaluate rule
+ * matching.
+ *
+ * A stand-alone reference implementation, also including notes about possible
+ * future optimisations, is available at:
+ * https://pipapo.lameexcu.se/
+ *
+ * Insertion
+ * ---------
+ *
+ * - For each packet field:
+ *
+ * - divide the b packet bits we want to classify into groups of size t,
+ * obtaining ceil(b / t) groups
+ *
+ * Example: match on destination IP address, with t = 4: 32 bits, 8 groups
+ * of 4 bits each
+ *
+ * - allocate a lookup table with one column ("bucket") for each possible
+ * value of a group, and with one row for each group
+ *
+ * Example: 8 groups, 2^4 buckets:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0
+ * 1
+ * 2
+ * 3
+ * 4
+ * 5
+ * 6
+ * 7
+ *
+ * - map the bits we want to classify for the current field, for a given
+ * entry, to a single rule for non-ranged and netmask set items, and to one
+ * or multiple rules for ranges. Ranges are expanded to composing netmasks
+ * by pipapo_expand().
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
+ * - rule #0: 10.0.0.5
+ * - rule #1: 192.168.1.0/24
+ * - rule #2: 192.168.2.0/31
+ *
+ * - insert references to the rules in the lookup table, selecting buckets
+ * according to bit values of a rule in the given group. This is done by
+ * pipapo_insert().
+ *
+ * Example: given:
+ * - rule #0: 10.0.0.5 mapping to buckets
+ * < 0 10 0 0 0 0 0 5 >
+ * - rule #1: 192.168.1.0/24 mapping to buckets
+ * < 12 0 10 8 0 1 < 0..15 > < 0..15 > >
+ * - rule #2: 192.168.2.0/31 mapping to buckets
+ * < 12 0 10 8 0 2 0 < 0..1 > >
+ *
+ * these bits are set in the lookup table:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * - if this is not the last field in the set, fill a mapping array that maps
+ * rules from the lookup table to rules belonging to the same entry in
+ * the next lookup table, done by pipapo_map().
+ *
+ * Note that as rules map to contiguous ranges of rules, given how netmask
+ * expansion and insertion is performed, &union nft_pipapo_map_bucket stores
+ * this information as pairs of first rule index, rule count.
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048,
+ * given lookup table #0 for field 0 (see example above):
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * and lookup table #1 for field 1 with:
+ * - rule #0: 1024 mapping to buckets
+ * < 0 0 4 0 >
+ * - rule #1: 2048 mapping to buckets
+ * < 0 0 5 0 >
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
+ * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
+ * (rules #1, #2) to 2048 in lookup table #2 (rule #1):
+ *
+ * ::
+ *
+ * rule indices in current field: 0 1 2
+ * map to rules in next field: 0 1 1
+ *
+ * - if this is the last field in the set, fill a mapping array that maps
+ * rules from the last lookup table to element pointers, also done by
+ * pipapo_map().
+ *
+ * Note that, in this implementation, we have two elements (start, end) for
+ * each entry. The pointer to the end element is stored in this array, and
+ * the pointer to the start element is linked from it.
+ *
+ * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem
+ * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42.
+ * From the rules of lookup table #1 as mapped above:
+ *
+ * ::
+ *
+ * rule indices in last field: 0 1
+ * map to elements: 0x66 0x42
+ *
+ *
+ * Matching
+ * --------
+ *
+ * We use a result bitmap, with the size of a single lookup table bucket, to
+ * represent the matching state that applies at every algorithm step. This is
+ * done by pipapo_lookup().
+ *
+ * - For each packet field:
+ *
+ * - start with an all-ones result bitmap (res_map in pipapo_lookup())
+ *
+ * - perform a lookup into the table corresponding to the current field,
+ * for each group, and at every group, AND the current result bitmap with
+ * the value from the lookup table bucket
+ *
+ * ::
+ *
+ * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from
+ * insertion examples.
+ * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for
+ * convenience in this example. Initial result bitmap is 0xff, the steps
+ * below show the value of the result bitmap after each group is processed:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6
+ *
+ * 1 1,2 0
+ * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6
+ *
+ * 2 0 1,2
+ * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6
+ *
+ * 3 0 1,2
+ * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6
+ *
+ * 4 0,1,2
+ * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6
+ *
+ * 5 0 1 2
+ * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2
+ *
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2
+ *
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2
+ *
+ * - at the next field, start with a new, all-zeroes result bitmap. For each
+ * bit set in the previous result bitmap, fill the new result bitmap
+ * (fill_map in pipapo_lookup()) with the rule indices from the
+ * corresponding buckets of the mapping field for this field, done by
+ * pipapo_refill()
+ *
+ * Example: with mapping table from insertion examples, with the current
+ * result bitmap from the previous example, 0x02:
+ *
+ * ::
+ *
+ * rule indices in current field: 0 1 2
+ * map to rules in next field: 0 1 1
+ *
+ * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be
+ * set.
+ *
+ * We can now extend this example to cover the second iteration of the step
+ * above (lookup and AND bitmap): assuming the port field is
+ * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table
+ * for "port" field from pre-computation example:
+ *
+ * ::
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5]
+ * & 0x3 [bucket 0], resulting bitmap is 0x2.
+ *
+ * - if this is the last field in the set, look up the value from the mapping
+ * array corresponding to the final result bitmap
+ *
+ * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for
+ * last field from insertion example:
+ *
+ * ::
+ *
+ * rule indices in last field: 0 1
+ * map to elements: 0x66 0x42
+ *
+ * the matching element is at 0x42.
+ *
+ *
+ * References
+ * ----------
+ *
+ * [Ligatti 2010]
+ * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with
+ * Automatic Time-space Tradeoffs
+ * Jay Ligatti, Josh Kuhn, and Chris Gage.
+ * Proceedings of the IEEE International Conference on Computer
+ * Communication Networks (ICCCN), August 2010.
+ * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
+ *
+ * [Rottenstreich 2010]
+ * Worst-Case TCAM Rule Expansion
+ * Ori Rottenstreich and Isaac Keslassy.
+ * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010.
+ * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf
+ *
+ * [Kogan 2014]
+ * SAX-PAC (Scalable And eXpressive PAcket Classification)
+ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
+ * and Patrick Eugster.
+ * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
+ * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
+ */
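
To make the matching loop concrete in code as well, here is a self-contained sketch with toy dimensions: one 16-bit field, 4-bit groups, and rule bitmaps shrunk to a single byte. It mirrors the per-group AND described above, not the kernel's actual data layout:

	#include <stdint.h>

	#define GROUPS	4	/* 16-bit key, 4 bits per group */
	#define BUCKETS	16	/* 2^4 possible values per group */

	/* lt[g][v] is the bitmap of rules whose group g matches value v. */
	static uint8_t match_field(const uint8_t lt[GROUPS][BUCKETS], uint16_t key)
	{
		uint8_t res = 0xff;	/* all-ones initial result bitmap */
		int g;

		for (g = 0; g < GROUPS; g++) {
			int v = (key >> (12 - 4 * g)) & 0xf;	/* group value, MSB first */

			res &= lt[g][v];	/* AND in this group's bucket */
		}
		return res;	/* set bits are indices of matching rules */
	}
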
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+/**
+ * pipapo_refill() - For each set bit, set bits from selected mapping table item
+ * @map: Bitmap to be scanned for set bits
+ * @len: Length of bitmap in longs
+ * @rules: Number of rules in field
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @match_only: Find a single bit and return, don't fill
+ *
+ * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain.
+ *
+ * For each bit set in map, select the bucket from mapping table with index
+ * corresponding to the position of the bit set. Use start bit and amount of
+ * bits specified in bucket to fill region in dst.
+ *
+ * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
+ */
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only)
+{
+ unsigned long bitset;
+ unsigned int k;
+ int ret = -1;
+
+ for (k = 0; k < len; k++) {
+ bitset = map[k];
+ while (bitset) {
+ unsigned long t = bitset & -bitset;
+ int r = __builtin_ctzl(bitset);
+ int i = k * BITS_PER_LONG + r;
+
+ if (unlikely(i >= rules)) {
+ map[k] = 0;
+ return -1;
+ }
+
+ if (match_only) {
+ bitmap_clear(map, i, 1);
+ return i;
+ }
+
+ ret = 0;
+
+ bitmap_set(dst, mt[i].to, mt[i].n);
+
+ bitset ^= t;
+ }
+ map[k] = 0;
+ }
+
+ return ret;
+}
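
The inner loop above is the set-bit iteration idiom credited to Daniel Lemire in the kerneldoc. Reduced to its essence for a single word:

	#include <stdio.h>

	/* Visit the index of every set bit in a word, lowest bit first. */
	static void for_each_set_bit_word(unsigned long word)
	{
		while (word) {
			unsigned long t = word & -word;	/* isolate lowest set bit */

			printf("bit %d\n", __builtin_ctzl(word));
			word ^= t;			/* clear it, continue */
		}
	}
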
+
+/**
+ * pipapo_get_slow() - Get matching element reference given key data
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * For more details, see DOC: Theory of Operation.
+ *
+ * This is the main lookup function. It matches key data against either
+ * the working match set or the uncommitted copy, depending on what the
+ * caller passed to us.
+ * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup
+ * (datapath lookup) pass the active copy.
+ * The insertion path will pass the uncommitted working copy.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
+{
+ unsigned long *res_map, *fill_map, *map;
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
+ bool map_index;
+ int i;
+
+ local_bh_disable();
+
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ goto out;
+ __local_lock_nested_bh(&scratch->bh_lock);
+
+ map_index = scratch->map_index;
+
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res_map = map + (map_index ? m->bsize_max : 0);
+ fill_map = map + (map_index ? 0 : m->bsize_max);
+
+ pipapo_resmap_init(m, res_map);
+
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+ int b;
+
+ /* For each bit group: select lookup table bucket depending on
+ * packet bytes value, then AND bucket value
+ */
+ if (likely(f->bb == 8))
+ pipapo_and_field_buckets_8bit(f, res_map, data);
+ else
+ pipapo_and_field_buckets_4bit(f, res_map, data);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+
+ /* Now populate the bitmap for the next field, unless this is
+ * the last field, in which case return the matched 'ext'
+ * pointer if any.
+ *
+ * Now res_map contains the matching bitmap, and fill_map is the
+ * bitmap for the next field.
+ */
+next_match:
+ b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
+ last);
+ if (b < 0) {
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ local_bh_enable();
+
+ return NULL;
+ }
+
+ if (last) {
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[b].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask)))
+ goto next_match;
+
+ /* Last field: we're just returning the key without
+ * filling the initial bitmap for the next field, so the
+ * current inactive bitmap is clean and can be reused as
+ * *next* bitmap (not initial) for the next packet.
+ */
+ scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ local_bh_enable();
+ return e;
+ }
+
+ /* Swap bitmap indices: res_map is the initial bitmap for the
+ * next field, and fill_map is guaranteed to be all-zeroes at
+ * this point.
+ */
+ map_index = !map_index;
+ swap(res_map, fill_map);
+
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
+ }
+
+ __local_unlock_nested_bh(&scratch->bh_lock);
+out:
+ local_bh_enable();
+ return NULL;
+}
+
+/**
+ * pipapo_get() - Get matching element reference given key data
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
+{
+ struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+ e = pipapo_get_avx2(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+ }
+#endif
+ e = pipapo_get_slow(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+}
+
+/**
+ * nft_pipapo_lookup() - Dataplane frontend for main lookup function
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: pointer to nft registers containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy.
+ * Unlike other set types, this uses 0 instead of nft_genmask_cur().
+ *
+ * This is because new (future) elements are not reachable from
+ * priv->match; they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but the
+ * newly matching entries are unreachable.
+ *
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get
+ * skipped, new-gen entries are only reachable from priv->clone.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const struct nft_pipapo_elem *e;
+
+ m = rcu_dereference(priv->match);
+ e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64());
+
+ return e ? &e->ext : NULL;
+}
+
+/**
+ * nft_pipapo_get() - Get matching element reference given key data
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @flags: Unused
+ *
+ * This function is called from the control plane path under
+ * RCU read lock.
+ *
+ * Return: set element private pointer or ERR_PTR(-ENOENT).
+ */
+static struct nft_elem_priv *
+nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = rcu_dereference(priv->match);
+ struct nft_pipapo_elem *e;
+
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net), get_jiffies_64());
+ if (!e)
+ return ERR_PTR(-ENOENT);
+
+ return &e->priv;
+}
+
+/**
+ * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize
+ * @f: Field containing mapping table
+ * @old_rules: Amount of existing mapped rules
+ * @rules: Amount of new rules to map
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int pipapo_realloc_mt(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
+{
+ union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt;
+ const unsigned int extra = PAGE_SIZE / sizeof(*new_mt);
+ unsigned int rules_alloc = rules;
+
+ might_sleep();
+
+ if (unlikely(rules == 0))
+ goto out_free;
+
+ /* growing and enough space left, no action needed */
+ if (rules > old_rules && f->rules_alloc > rules)
+ return 0;
+
+ /* downsize and extra slack has not grown too large */
+ if (rules < old_rules) {
+ unsigned int remove = f->rules_alloc - rules;
+
+ if (remove < (2u * extra))
+ return 0;
+ }
+
+ /* If the set needs more than one page of memory for rules, then
+ * allocate another extra page to avoid frequent reallocation.
+ */
+ if (rules > extra &&
+ check_add_overflow(rules, extra, &rules_alloc))
+ return -EOVERFLOW;
+
+ if (rules_alloc > (INT_MAX / sizeof(*new_mt)))
+ return -ENOMEM;
+
+ new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT);
+ if (!new_mt)
+ return -ENOMEM;
+
+ if (old_mt)
+ memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt));
+
+ if (rules > old_rules) {
+ memset(new_mt + old_rules, 0,
+ (rules - old_rules) * sizeof(*new_mt));
+ }
+out_free:
+ f->rules_alloc = rules_alloc;
+ f->mt = new_mt;
+
+ kvfree(old_mt);
+
+ return 0;
+}
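
The two early returns implement hysteresis so the mapping table is not reallocated on every rule insertion or removal: grow only when full, shrink only once at least two pages of slack would be reclaimed. The decision rule in isolation (a sketch; mt_needs_realloc() and EXTRA are illustrative names):

	#include <stdbool.h>

	#define EXTRA	256	/* stand-in for PAGE_SIZE / sizeof(*mt) */

	/* alloc: current capacity; old_rules -> rules: requested transition. */
	static bool mt_needs_realloc(unsigned int alloc,
				     unsigned int old_rules, unsigned int rules)
	{
		if (rules > old_rules)			/* growing */
			return alloc <= rules;		/* realloc only when full */
		/* shrinking: realloc only once two pages of slack would be freed */
		return alloc - rules >= 2u * EXTRA;
	}
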
+
+
+/**
+ * lt_calculate_size() - Get storage size for lookup table with overflow check
+ * @groups: Amount of bit groups
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @bsize: Size of each bucket in lookup table, in longs
+ *
+ * Return: allocation size including alignment overhead, negative on overflow
+ */
+static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb,
+ unsigned int bsize)
+{
+ ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long);
+
+ if (check_mul_overflow(ret, bsize, &ret))
+ return -1;
+ if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret))
+ return -1;
+ if (ret > INT_MAX)
+ return -1;
+
+ return ret;
+}
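
check_mul_overflow() and check_add_overflow() map onto the compiler overflow builtins, so the same computation can be sketched in userspace as follows; ALIGN_HEADROOM is an illustrative constant, not the kernel's NFT_PIPAPO_ALIGN_HEADROOM value:

	#include <stdint.h>
	#include <sys/types.h>

	#define ALIGN_HEADROOM	32	/* illustrative alignment slack */

	static ssize_t lt_size(size_t groups, size_t buckets, size_t bsize)
	{
		size_t ret;

		if (__builtin_mul_overflow(groups, buckets, &ret) ||
		    __builtin_mul_overflow(ret, bsize, &ret) ||
		    __builtin_mul_overflow(ret, sizeof(long), &ret) ||
		    __builtin_add_overflow(ret, ALIGN_HEADROOM, &ret) ||
		    ret > INT32_MAX)
			return -1;	/* overflow: caller treats as -ENOMEM */
		return ret;
	}
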
+
+/**
+ * pipapo_resize() - Resize lookup or mapping table, or both
+ * @f: Field containing lookup and mapping tables
+ * @old_rules: Previous amount of rules in field
+ * @rules: New amount of rules
+ *
+ * Increase, decrease or maintain tables size depending on new amount of rules,
+ * and copy data over. In case the new size is smaller, throw away data for
+ * highest-numbered rules.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure.
+ */
+static int pipapo_resize(struct nft_pipapo_field *f,
+ unsigned int old_rules, unsigned int rules)
+{
+ long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p;
+ unsigned int new_bucket_size, copy;
+ int group, bucket, err;
+ ssize_t lt_size;
+
+ if (rules >= NFT_PIPAPO_RULE0_MAX)
+ return -ENOSPC;
+
+ new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
+#ifdef NFT_PIPAPO_ALIGN
+ new_bucket_size = roundup(new_bucket_size,
+ NFT_PIPAPO_ALIGN / sizeof(*new_lt));
+#endif
+
+ if (new_bucket_size == f->bsize)
+ goto mt;
+
+ if (new_bucket_size > f->bsize)
+ copy = f->bsize;
+ else
+ copy = new_bucket_size;
+
+ lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size);
+ if (lt_size < 0)
+ return -ENOMEM;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
+ if (!new_lt)
+ return -ENOMEM;
+
+ new_p = NFT_PIPAPO_LT_ALIGN(new_lt);
+ old_p = NFT_PIPAPO_LT_ALIGN(old_lt);
+
+ for (group = 0; group < f->groups; group++) {
+ for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) {
+ memcpy(new_p, old_p, copy * sizeof(*new_p));
+ new_p += copy;
+ old_p += copy;
+
+ if (new_bucket_size > f->bsize)
+ new_p += new_bucket_size - f->bsize;
+ else
+ old_p += f->bsize - new_bucket_size;
+ }
+ }
+
+mt:
+ err = pipapo_realloc_mt(f, old_rules, rules);
+ if (err) {
+ kvfree(new_lt);
+ return err;
+ }
+
+ if (new_lt) {
+ f->bsize = new_bucket_size;
+ f->lt = new_lt;
+ kvfree(old_lt);
+ }
+
+ return 0;
+}
+
+/**
+ * pipapo_bucket_set() - Set rule bit in bucket given group and group value
+ * @f: Field containing lookup table
+ * @rule: Rule index
+ * @group: Group index
+ * @v: Value of bit group
+ */
+static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
+ int v)
+{
+ unsigned long *pos;
+
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt);
+ pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group;
+ pos += f->bsize * v;
+
+ __set_bit(rule, pos);
+}
+
+/**
+ * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit intersection between:
+ * - bucket with index given by the upper 4 bits of b, from group g, and
+ * - bucket with index given by the lower 4 bits of b, from group g + 1
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * N(b, g) := O(b / 16, g) & O(b % 16, g + 1)
+ *
+ * This ensures equivalence of the matching results on lookup. Two examples in
+ * pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255
+ * 0 ^
+ * 1 | ^
+ * ... ( & ) |
+ * / \ |
+ * / \ .-( & )-.
+ * / bucket \ | |
+ * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 |
+ * 0 / \ | |
+ * 1 \ | |
+ * 2 | --'
+ * 3 '-
+ * ...
+ */
+static void pipapo_lt_4b_to_8b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, i;
+
+ for (g = 0; g < old_groups / 2; g++) {
+ int src_g0 = g * 2, src_g1 = g * 2 + 1;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) {
+ int src_b0 = b / NFT_PIPAPO_BUCKETS(4);
+ int src_b1 = b % NFT_PIPAPO_BUCKETS(4);
+ int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0;
+ int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1;
+
+ for (i = 0; i < bsize; i++) {
+ *new_lt = old_lt[src_i0 * bsize + i] &
+ old_lt[src_i1 * bsize + i];
+ new_lt++;
+ }
+ }
+ }
+}
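
The identity N(b, g) := O(b / 16, g) & O(b % 16, g + 1) from the kerneldoc is easiest to see for a single pair of source groups. A sketch with rule bitmaps shrunk to one 64-bit word per bucket:

	#include <stdint.h>

	/* old_hi: 16 buckets of the high-nibble group, old_lo: 16 buckets of
	 * the low-nibble group; new_bucket: 256 buckets of the merged group.
	 */
	static void merge_4b_to_8b(const uint64_t old_hi[16],
				   const uint64_t old_lo[16],
				   uint64_t new_bucket[256])
	{
		int b;

		for (b = 0; b < 256; b++)
			new_bucket[b] = old_hi[b >> 4] & old_lo[b & 0x0f];
	}
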
+
+/**
+ * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit union of:
+ * - all the buckets with index such that the upper four bits of the lower byte
+ * equal b, from group g, with g odd
+ * - all the buckets with index such that the lower four bits equal b, from
+ * group g, with g even
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4)
+ * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f)
+ *
+ * where U() denotes the arbitrary union operation (binary OR of n terms). This
+ * ensures equivalence of the matching results on lookup.
+ */
+static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, bsrc, i;
+
+ memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize *
+ sizeof(unsigned long));
+
+ for (g = 0; g < old_groups * 2; g += 2) {
+ int src_g = g / 2;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if (((bsrc & 0xf0) >> 4) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if ((bsrc & 0x0f) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+ }
+}
+
+/**
+ * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed
+ * @f: Field containing lookup table
+ */
+static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
+{
+ unsigned int groups, bb;
+ unsigned long *new_lt;
+ ssize_t lt_size;
+
+ lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
+ sizeof(*f->lt);
+
+ if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET &&
+ lt_size > NFT_PIPAPO_LT_SIZE_HIGH) {
+ groups = f->groups * 2;
+ bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
+
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
+ } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
+ lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
+ groups = f->groups / 2;
+ bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
+
+ lt_size = lt_calculate_size(groups, bb, f->bsize);
+ if (lt_size < 0)
+ return;
+
+ /* Don't increase group width if the resulting lookup table size
+ * would exceed the upper size threshold for a "small" set.
+ */
+ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH)
+ return;
+ } else {
+ return;
+ }
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
+ if (!new_lt)
+ return;
+
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+ if (f->bb == 4 && bb == 8) {
+ pipapo_lt_4b_to_8b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else if (f->bb == 8 && bb == 4) {
+ pipapo_lt_8b_to_4b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else {
+ BUG();
+ }
+
+ f->groups = groups;
+ f->bb = bb;
+ kvfree(f->lt);
+ f->lt = new_lt;
+}
+
+/**
+ * pipapo_insert() - Insert new rule in field given input key and mask length
+ * @f: Field containing lookup table
+ * @k: Input key for classification, without nftables padding
+ * @mask_bits: Length of mask; matches field length for non-ranged entry
+ *
+ * Insert a new rule reference in lookup buckets corresponding to k and
+ * mask_bits.
+ *
+ * Return: 1 on success (one rule inserted), negative error code on failure.
+ */
+static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
+ int mask_bits)
+{
+ unsigned int rule = f->rules, group, ret, bit_offset = 0;
+
+ ret = pipapo_resize(f, f->rules, f->rules + 1);
+ if (ret)
+ return ret;
+
+ f->rules++;
+
+ for (group = 0; group < f->groups; group++) {
+ int i, v;
+ u8 mask;
+
+ v = k[group / (BITS_PER_BYTE / f->bb)];
+ v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0);
+ v >>= (BITS_PER_BYTE - bit_offset) - f->bb;
+
+ bit_offset += f->bb;
+ bit_offset %= BITS_PER_BYTE;
+
+ if (mask_bits >= (group + 1) * f->bb) {
+ /* Not masked */
+ pipapo_bucket_set(f, rule, group, v);
+ } else if (mask_bits <= group * f->bb) {
+ /* Completely masked */
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++)
+ pipapo_bucket_set(f, rule, group, i);
+ } else {
+ /* The mask limit falls on this group */
+ mask = GENMASK(f->bb - 1, 0);
+ mask >>= mask_bits - group * f->bb;
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) {
+ if ((i & ~mask) == (v & ~mask))
+ pipapo_bucket_set(f, rule, group, i);
+ }
+ }
+ }
+
+ pipapo_lt_bits_adjust(f);
+
+ return 1;
+}
+
+/**
+ * pipapo_step_diff() - Check if setting @step bit in netmask would change it
+ * @base: Mask we are expanding
+ * @step: Step bit for given expansion step
+ * @len: Total length of mask space (set and unset bits), bytes
+ *
+ * Convenience function for mask expansion.
+ *
+ * Return: true if step bit changes mask (i.e. isn't set), false otherwise.
+ */
+static bool pipapo_step_diff(u8 *base, int step, int len)
+{
+ /* Network order, byte-addressed */
+#ifdef __BIG_ENDIAN__
+ return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]);
+#else
+ return !(BIT(step % BITS_PER_BYTE) &
+ base[len - 1 - step / BITS_PER_BYTE]);
+#endif
+}
+
+/**
+ * pipapo_step_after_end() - Check if mask exceeds range end with given step
+ * @base: Mask we are expanding
+ * @end: End of range
+ * @step: Step bit for given expansion step, highest bit to be set
+ * @len: Total length of mask space (set and unset bits), bytes
+ *
+ * Convenience function for mask expansion.
+ *
+ * Return: true if mask exceeds range setting step bits, false otherwise.
+ */
+static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step,
+ int len)
+{
+ u8 tmp[NFT_PIPAPO_MAX_BYTES];
+ int i;
+
+ memcpy(tmp, base, len);
+
+ /* Network order, byte-addressed */
+ for (i = 0; i <= step; i++)
+#ifdef __BIG_ENDIAN__
+ tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE);
+#else
+ tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE);
+#endif
+
+ return memcmp(tmp, end, len) > 0;
+}
+
+/**
+ * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry
+ * @base: Netmask base
+ * @step: Step bit to sum
+ * @len: Netmask length, bytes
+ */
+static void pipapo_base_sum(u8 *base, int step, int len)
+{
+ bool carry = false;
+ int i;
+
+ /* Network order, byte-addressed */
+#ifdef __BIG_ENDIAN__
+ for (i = step / BITS_PER_BYTE; i < len; i++) {
+#else
+ for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) {
+#endif
+ if (carry)
+ base[i]++;
+ else
+ base[i] += 1 << (step % BITS_PER_BYTE);
+
+ if (base[i])
+ break;
+
+ carry = true;
+ }
+}
+
+/**
+ * pipapo_expand() - Expand to composing netmasks, insert into lookup table
+ * @f: Field containing lookup table
+ * @start: Start of range
+ * @end: End of range
+ * @len: Length of value in bits
+ *
+ * Expand range to composing netmasks and insert corresponding rule references
+ * in lookup buckets.
+ *
+ * Return: number of inserted rules on success, negative error code on failure.
+ */
+static int pipapo_expand(struct nft_pipapo_field *f,
+ const u8 *start, const u8 *end, int len)
+{
+ int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE);
+ u8 base[NFT_PIPAPO_MAX_BYTES];
+
+ memcpy(base, start, bytes);
+ while (memcmp(base, end, bytes) <= 0) {
+ int err;
+
+ step = 0;
+ while (pipapo_step_diff(base, step, bytes)) {
+ if (pipapo_step_after_end(base, end, step, bytes))
+ break;
+
+ step++;
+ if (step >= len) {
+ if (!masks) {
+ err = pipapo_insert(f, base, 0);
+ if (err < 0)
+ return err;
+ masks = 1;
+ }
+ goto out;
+ }
+ }
+
+ err = pipapo_insert(f, base, len - step);
+
+ if (err < 0)
+ return err;
+
+ masks++;
+ pipapo_base_sum(base, step, bytes);
+ }
+out:
+ return masks;
+}
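
This is the in-kernel form of the range-to-netmask conversion cited in the theory section ([Rottenstreich 2010]: a contiguous range over b bits expands into at most 2 * b netmasks). A self-contained greedy version over an 8-bit space, following the same step-bit logic:

	#include <stdio.h>

	/* Print the prefixes composing [start, end] over an 8-bit space. */
	static int expand8(unsigned int start, unsigned int end)
	{
		unsigned int base = start;	/* wider than 8 bits: 255 + 1 must not wrap */
		int masks = 0;

		while (base <= end) {
			int step = 0;

			while (!(base & (1u << step))) {	/* step bit still clear */
				if ((base | ((2u << step) - 1)) > end)
					break;			/* block would pass 'end' */
				if (++step >= 8) {		/* full space covered */
					printf("%u/0\n", base);
					return masks + 1;
				}
			}
			printf("%u/%d\n", base, 8 - step);	/* block of 2^step values */
			masks++;
			base += 1u << step;
		}
		return masks;	/* expand8(10, 20) prints 10/7 12/6 16/6 20/8 */
	}
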
+
+/**
+ * pipapo_map() - Insert rules in mapping tables, mapping them between fields
+ * @m: Matching data, including mapping table
+ * @map: Table of rule maps: array of first rule and amount of rules
+ * in next field a given rule maps to, for each field
+ * @e: For last field, nft_set_ext pointer matching rules map to
+ */
+static void pipapo_map(struct nft_pipapo_match *m,
+ union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS],
+ struct nft_pipapo_elem *e)
+{
+ struct nft_pipapo_field *f;
+ int i, j;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) {
+ for (j = 0; j < map[i].n; j++) {
+ f->mt[map[i].to + j].to = map[i + 1].to;
+ f->mt[map[i].to + j].n = map[i + 1].n;
+ }
+ }
+
+ /* Last field: map to ext instead of mapping to next field */
+ for (j = 0; j < map[i].n; j++)
+ f->mt[map[i].to + j].e = e;
+}
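+
+/* Illustrative example, not part of the original patch: with two fields,
+ * map[0] = { .to = 5, .n = 2 } and map[1] = { .to = 7, .n = 3 } make
+ * rules 5 and 6 of field 0 both point to rules 7..9 of field 1, while
+ * rules 7..9 of the last field all point to the element @e.
+ */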
+
+/**
+ * pipapo_free_scratch() - Free per-CPU map at original address
+ * @m: Matching data
+ * @cpu: CPU number
+ */
+static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
+{
+ struct nft_pipapo_scratch *s;
+
+ s = *per_cpu_ptr(m->scratch, cpu);
+
+ kvfree(s);
+}
+
+/**
+ * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results
+ * @clone: Copy of matching data with pending insertions and deletions
+ * @bsize_max: Maximum bucket size, scratch maps cover two buckets
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
+ unsigned long bsize_max)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct nft_pipapo_scratch *scratch;
+
+ scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL_ACCOUNT, cpu_to_node(i));
+ if (!scratch) {
+ /* On failure, there's no need to undo previous
+ * allocations: this means that some scratch maps have
+ * a bigger allocated size now (this is only called on
+ * insertion), but the extra space won't be used by any
+ * CPU as new elements are not inserted and m->bsize_max
+ * is not updated.
+ */
+ return -ENOMEM;
+ }
+
+ pipapo_free_scratch(clone, i);
+ local_lock_init(&scratch->bh_lock);
+ *per_cpu_ptr(clone->scratch, i) = scratch;
+ }
+
+ return 0;
+}
+
+static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ const struct net *net = read_pnet(&set->net);
+
+ return lockdep_is_held(&nft_pernet(net)->commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old);
+
+/**
+ * pipapo_maybe_clone() - Build clone for pending data changes, if none exists
+ * @set: nftables API set representation
+ *
+ * Return: newly created or existing clone. NULL on allocation failure.
+ */
+static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+
+ if (priv->clone)
+ return priv->clone;
+
+ m = rcu_dereference_protected(priv->match,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = pipapo_clone(m);
+
+ return priv->clone;
+}
+
+/**
+ * nft_pipapo_insert() - Validate and insert ranged elements
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ * @elem_priv: Filled with pointer to &struct nft_elem_priv of inserted element
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem,
+ struct nft_elem_priv **elem_priv)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ const u8 *start = (const u8 *)elem->key.val.data, *end;
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
+ u8 genmask = nft_genmask_next(net);
+ struct nft_pipapo_elem *e, *dup;
+ u64 tstamp = nft_net_tstamp(net);
+ struct nft_pipapo_field *f;
+ const u8 *start_p, *end_p;
+ int i, bsize_max, err = 0;
+
+ if (!m)
+ return -ENOMEM;
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+ end = (const u8 *)nft_set_ext_key_end(ext)->data;
+ else
+ end = start;
+
+ dup = pipapo_get(m, start, genmask, tstamp);
+ if (dup) {
+ /* Check if we already have the same exact entry */
+ const struct nft_data *dup_key, *dup_end;
+
+ dup_key = nft_set_ext_key(&dup->ext);
+ if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END))
+ dup_end = nft_set_ext_key_end(&dup->ext);
+ else
+ dup_end = dup_key;
+
+ if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
+ !memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
+ *elem_priv = &dup->priv;
+ return -EEXIST;
+ }
+
+ return -ENOTEMPTY;
+ }
+
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(m, end, nft_genmask_next(net), tstamp);
+ if (dup) {
+ *elem_priv = &dup->priv;
+ return -ENOTEMPTY;
+ }
+
+ /* Validate */
+ start_p = start;
+ end_p = end;
+
+ /* some helpers return -1, or a valid rule position >= 0,
+ * so we cannot support more than INT_MAX rules at this time.
+ */
+ BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX);
+
+ nft_pipapo_for_each_field(f, i, m) {
+ if (f->rules >= NFT_PIPAPO_RULE0_MAX)
+ return -ENOSPC;
+
+ if (memcmp(start_p, end_p,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0)
+ return -EINVAL;
+
+ start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+ /* Insert */
+ bsize_max = m->bsize_max;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ int ret;
+
+ rulemap[i].to = f->rules;
+
+ ret = memcmp(start, end,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
+ if (!ret)
+ ret = pipapo_insert(f, start, f->groups * f->bb);
+ else
+ ret = pipapo_expand(f, start, end, f->groups * f->bb);
+
+ if (ret < 0)
+ return ret;
+
+ if (f->bsize > bsize_max)
+ bsize_max = f->bsize;
+
+ rulemap[i].n = ret;
+
+ start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+ if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
+ put_cpu_ptr(m->scratch);
+
+ err = pipapo_realloc_scratch(m, bsize_max);
+ if (err)
+ return err;
+
+ m->bsize_max = bsize_max;
+ } else {
+ put_cpu_ptr(m->scratch);
+ }
+
+ e = nft_elem_priv_cast(elem->priv);
+ *elem_priv = &e->priv;
+
+ pipapo_map(m, rulemap, e);
+
+ return 0;
+}
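+
+/* Usage sketch, not part of the original patch and with hypothetical set
+ * and table names:
+ *
+ *	nft add element inet filter myset { 192.168.1.0-192.168.2.1 . 80 }
+ *
+ * ends up here for a concatenated interval set: the address range is
+ * expanded via pipapo_expand(), the single port is inserted as one rule
+ * via pipapo_insert(), and rulemap[] ties the two fields together.
+ */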
+
+/**
+ * pipapo_clone() - Clone matching data to create new working copy
+ * @old: Existing matching data
+ *
+ * Return: copy of matching data passed as 'old', or NULL on failure.
+ */
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
+{
+ struct nft_pipapo_field *dst, *src;
+ struct nft_pipapo_match *new;
+ int i;
+
+ new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return NULL;
+
+ new->field_count = old->field_count;
+ new->bsize_max = old->bsize_max;
+
+ new->scratch = alloc_percpu(*new->scratch);
+ if (!new->scratch)
+ goto out_scratch;
+
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(new->scratch, i) = NULL;
+
+ if (pipapo_realloc_scratch(new, old->bsize_max))
+ goto out_scratch_realloc;
+
+ rcu_head_init(&new->rcu);
+
+ src = old->f;
+ dst = new->f;
+
+ for (i = 0; i < old->field_count; i++) {
+ unsigned long *new_lt;
+ ssize_t lt_size;
+
+ memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
+
+ lt_size = lt_calculate_size(src->groups, src->bb, src->bsize);
+ if (lt_size < 0)
+ goto out_lt;
+
+ new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT);
+ if (!new_lt)
+ goto out_lt;
+
+ dst->lt = new_lt;
+
+ memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
+ NFT_PIPAPO_LT_ALIGN(src->lt),
+ src->bsize * sizeof(*dst->lt) *
+ src->groups * NFT_PIPAPO_BUCKETS(src->bb));
+
+ if (src->rules > 0) {
+ if (src->rules_alloc > (INT_MAX / sizeof(*src->mt)))
+ goto out_mt;
+
+ dst->mt = kvmalloc_array(src->rules_alloc,
+ sizeof(*src->mt),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst->mt)
+ goto out_mt;
+
+ memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt));
+ } else {
+ dst->mt = NULL;
+ dst->rules_alloc = 0;
+ }
+
+ src++;
+ dst++;
+ }
+
+ return new;
+
+out_mt:
+ kvfree(dst->lt);
+out_lt:
+ for (dst--; i > 0; i--) {
+ kvfree(dst->mt);
+ kvfree(dst->lt);
+ dst--;
+ }
+out_scratch_realloc:
+ for_each_possible_cpu(i)
+ pipapo_free_scratch(new, i);
+out_scratch:
+ free_percpu(new->scratch);
+ kfree(new);
+
+ return NULL;
+}
+
+/**
+ * pipapo_rules_same_key() - Get number of rules that originated from same entry
+ * @f: Field containing mapping table
+ * @first: Index of first rule in set of rules mapping to same entry
+ *
+ * Using the fact that all rules in a field that originated from the same entry
+ * will map to the same set of rules in the next field, or to the same element
+ * reference, return the cardinality of the set of rules that originated from
+ * the same entry as the rule with index @first, @first rule included.
+ *
+ * In pictures:
+ * rules
+ * field #0 0 1 2 3 4
+ * map to: 0 1 2-4 2-4 5-9
+ * . . ....... . ...
+ * | | | | \ \
+ * | | | | \ \
+ * | | | | \ \
+ * ' ' ' ' ' \
+ * in field #1 0 1 2 3 4 5 ...
+ *
+ * if this is called for rule 2 on field #0, it will return 2, as rules 2 and 3
+ * in field 0 both map to the same set of rules (2, 3, 4) in the next field.
+ *
+ * For the last field in a set, we can rely on associated entries to map to the
+ * same element references.
+ *
+ * Return: Number of rules that originated from the same entry as @first.
+ */
+static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first)
+{
+ struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */
+ unsigned int r;
+
+ for (r = first; r < f->rules; r++) {
+ if (r != first && e != f->mt[r].e)
+ return r - first;
+
+ e = f->mt[r].e;
+ }
+
+ if (r != first)
+ return r - first;
+
+ return 0;
+}
+
+/**
+ * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones
+ * @mt: Mapping array
+ * @rules: Original amount of rules in mapping table
+ * @start: First rule index to be removed
+ * @n: Amount of rules to be removed
+ * @to_offset: First rule index, in next field, this group of rules maps to
+ * @is_last: If this is the last field, delete reference from mapping array
+ *
+ * This is used to unmap rules from the mapping table for a single field,
+ * maintaining consistency and compactness for the existing ones.
+ *
+ * In pictures: let's assume that we want to delete rules 2 and 3 from the
+ * following mapping array:
+ *
+ * rules
+ * 0 1 2 3 4
+ * map to: 4-10 4-10 11-15 11-15 16-18
+ *
+ * the result will be:
+ *
+ * rules
+ * 0 1 2
+ * map to: 4-10 4-10 11-13
+ *
+ * for fields before the last one. In case this is the mapping table for the
+ * last field in a set, and rules map to pointers to &struct nft_pipapo_elem:
+ *
+ * rules
+ * 0 1 2 3 4
+ * element pointers: 0x42 0x42 0x33 0x33 0x44
+ *
+ * the result will be:
+ *
+ * rules
+ * 0 1 2
+ * element pointers: 0x42 0x42 0x44
+ */
+static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules,
+ unsigned int start, unsigned int n,
+ unsigned int to_offset, bool is_last)
+{
+ int i;
+
+ memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt));
+ memset(mt + rules - n, 0, n * sizeof(*mt));
+
+ if (is_last)
+ return;
+
+ for (i = start; i < rules - n; i++)
+ mt[i].to -= to_offset;
+}
+
+/**
+ * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map
+ * @m: Matching data
+ * @rulemap: Table of rule maps, arrays of first rule and amount of rules
+ * in next field a given entry maps to, for each field
+ *
+ * For each rule in lookup table buckets mapping to this set of rules, drop
+ * all bits set in lookup table mapping. In pictures, assuming we want to drop
+ * rules 0 and 1 from this lookup table:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * rule 2 becomes rule 0, and the result will be:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0
+ * 1 0
+ * 2 0
+ * 3 0
+ * 4 0
+ * 5 0
+ * 6 0
+ * 7 0 0
+ *
+ * once this is done, call unmap() to drop all the corresponding rule references
+ * from mapping tables.
+ */
+static void pipapo_drop(struct nft_pipapo_match *m,
+ union nft_pipapo_map_bucket rulemap[])
+{
+ struct nft_pipapo_field *f;
+ int i;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ int g;
+
+ for (g = 0; g < f->groups; g++) {
+ unsigned long *pos;
+ int b;
+
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g *
+ NFT_PIPAPO_BUCKETS(f->bb) * f->bsize;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
+ bitmap_cut(pos, pos, rulemap[i].to,
+ rulemap[i].n,
+ f->bsize * BITS_PER_LONG);
+
+ pos += f->bsize;
+ }
+ }
+
+ pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n,
+ rulemap[i + 1].n, i == m->field_count - 1);
+ if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) {
+ /* We can ignore this, a failure to shrink tables down
+ * doesn't make tables invalid.
+ */
+ ;
+ }
+ f->rules -= rulemap[i].n;
+
+ pipapo_lt_bits_adjust(f);
+ }
+}
+
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+ struct nft_pipapo_elem *e)
+
+{
+ nft_setelem_data_deactivate(net, set, &e->priv);
+}
+
+/**
+ * pipapo_gc() - Drop expired entries from set, destroy start and end elements
+ * @set: nftables API set representation
+ * @m: Matching data
+ */
+static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct net *net = read_pnet(&set->net);
+ unsigned int rules_f0, first_rule = 0;
+ u64 tstamp = nft_net_tstamp(net);
+ struct nft_pipapo_elem *e;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ const struct nft_pipapo_field *f;
+ unsigned int i, start, rules_fx;
+
+ start = first_rule;
+ rules_fx = rules_f0;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ rulemap[i].to = start;
+ rulemap[i].n = rules_fx;
+
+ if (i < m->field_count - 1) {
+ rules_fx = f->mt[start].n;
+ start = f->mt[start].to;
+ }
+ }
+
+ /* Pick the last field, and its last index */
+ f--;
+ i--;
+ e = f->mt[rulemap[i].to].e;
+
+ /* synchronous gc never fails, there is no need to set the
+ * NFT_SET_ELEM_DEAD_BIT.
+ */
+ if (__nft_set_elem_expired(&e->ext, tstamp)) {
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ nft_pipapo_gc_deactivate(net, set, e);
+ pipapo_drop(m, rulemap);
+ nft_trans_gc_elem_add(gc, e);
+
+ /* And check again current first rule, which is now the
+ * first we haven't checked.
+ */
+ } else {
+ first_rule += rules_f0;
+ }
+ }
+
+ gc = nft_trans_gc_catchall_sync(gc);
+ if (gc) {
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
+}
+
+/**
+ * pipapo_free_fields() - Free per-field tables contained in matching data
+ * @m: Matching data
+ */
+static void pipapo_free_fields(struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ int i;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ kvfree(f->lt);
+ kvfree(f->mt);
+ }
+}
+
+static void pipapo_free_match(struct nft_pipapo_match *m)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ pipapo_free_scratch(m, i);
+
+ free_percpu(m->scratch);
+ pipapo_free_fields(m);
+
+ kfree(m);
+}
+
+/**
+ * pipapo_reclaim_match() - RCU callback to free fields from old matching data
+ * @rcu: RCU head
+ */
+static void pipapo_reclaim_match(struct rcu_head *rcu)
+{
+ struct nft_pipapo_match *m;
+
+ m = container_of(rcu, struct nft_pipapo_match, rcu);
+ pipapo_free_match(m);
+}
+
+/**
+ * nft_pipapo_commit() - Replace lookup data with current working copy
+ * @set: nftables API set representation
+ *
+ * While at it, check if we should perform garbage collection on the working
+ * copy before committing it for lookup, and don't replace the table if the
+ * working copy doesn't have pending changes.
+ *
+ * We also need to create a new working copy for subsequent insertions and
+ * deletions.
+ */
+static void nft_pipapo_commit(struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *old;
+
+ if (!priv->clone)
+ return;
+
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ pipapo_gc(set, priv->clone);
+
+ old = rcu_replace_pointer(priv->match, priv->clone,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = NULL;
+
+ if (old)
+ call_rcu(&old->rcu, pipapo_reclaim_match);
+}
+
+static void nft_pipapo_abort(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+
+ if (!priv->clone)
+ return;
+ pipapo_free_match(priv->clone);
+ priv->clone = NULL;
+}
+
+/**
+ * nft_pipapo_activate() - Mark element reference as active given key, commit
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem_priv: nftables API element representation containing key data
+ *
+ * On insertion, elements are added to a copy of the matching data currently
+ * in use for lookups, and not directly inserted into current lookup data. Both
+ * nft_pipapo_insert() and nft_pipapo_activate() are called once for each
+ * element, hence we can't use either one as a real commit operation.
+ */
+static void nft_pipapo_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
+
+ nft_clear(net, &e->ext);
+}
+
+/**
+ * nft_pipapo_deactivate() - Search for element and make it inactive
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem: nftables API element representation containing key data
+ *
+ * Return: deactivated element if found, NULL otherwise.
+ */
+static struct nft_elem_priv *
+nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
+{
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
+ struct nft_pipapo_elem *e;
+
+ /* removal must occur on priv->clone, if we are low on memory
+ * we have no choice and must fail the removal request.
+ */
+ if (!m)
+ return NULL;
+
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_next(net), nft_net_tstamp(net));
+ if (!e)
+ return NULL;
+
+ nft_set_elem_change_active(net, set, &e->ext);
+
+ return &e->priv;
+}
+
+/**
+ * nft_pipapo_flush() - Make element inactive
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem_priv: nftables API element representation containing key data
+ *
+ * This is functionally the same as nft_pipapo_deactivate(), with a slightly
+ * different interface, and it's also called once for each element in a set
+ * being flushed, so we can't implement, strictly speaking, a flush operation,
+ * which would otherwise be as simple as allocating an empty copy of the
+ * matching data.
+ *
+ * Note that we could in theory do that, mark the set as flushed, and ignore
+ * subsequent calls, but we would leak all the elements after the first one,
+ * because they wouldn't then be freed as a result of API calls.
+ */
+static void nft_pipapo_flush(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv);
+
+ nft_set_elem_change_active(net, set, &e->ext);
+}
+
+/**
+ * pipapo_get_boundaries() - Get byte interval for associated rules
+ * @f: Field including lookup table
+ * @first_rule: First rule (lowest index)
+ * @rule_count: Number of associated rules
+ * @left: Byte expression for left boundary (start of range)
+ * @right: Byte expression for right boundary (end of range)
+ *
+ * Given the first rule and amount of rules that originated from the same entry,
+ * build the original range associated with the entry, and calculate the length
+ * of the originating netmask.
+ *
+ * In pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 1,2
+ * 1 1,2
+ * 2 1,2
+ * 3 1,2
+ * 4 1,2
+ * 5 1 2
+ * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ *
+ * this is the lookup table corresponding to the IPv4 range
+ * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks,
+ * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31.
+ *
+ * This function fills @left and @right with the byte values of the leftmost
+ * and rightmost bucket indices for the lowest and highest rule indices,
+ * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in
+ * nibbles:
+ * left: < 12, 0, 10, 8, 0, 1, 0, 0 >
+ * right: < 12, 0, 10, 8, 0, 2, 2, 1 >
+ * corresponding to bytes:
+ * left: < 192, 168, 1, 0 >
+ * right: < 192, 168, 2, 1 >
+ * with mask length irrelevant here, unused on return, as the range is already
+ * defined by its start and end points. The mask length is relevant for a single
+ * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore
+ * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes
+ * < 192, 168, 1, 255 >, and the mask length, calculated from the distances
+ * between leftmost and rightmost bucket indices for each group, would be 24.
+ *
+ * Return: mask length, in bits.
+ */
+static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule,
+ int rule_count, u8 *left, u8 *right)
+{
+ int g, mask_len = 0, bit_offset = 0;
+ u8 *l = left, *r = right;
+
+ for (g = 0; g < f->groups; g++) {
+ int b, x0, x1;
+
+ x0 = -1;
+ x1 = -1;
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
+ unsigned long *pos;
+
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) +
+ (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize;
+ if (test_bit(first_rule, pos) && x0 == -1)
+ x0 = b;
+ if (test_bit(first_rule + rule_count - 1, pos))
+ x1 = b;
+ }
+
+ *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset);
+ *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset);
+
+ bit_offset += f->bb;
+ if (bit_offset >= BITS_PER_BYTE) {
+ bit_offset %= BITS_PER_BYTE;
+ l++;
+ r++;
+ }
+
+ if (x1 - x0 == 0)
+ mask_len += 4;
+ else if (x1 - x0 == 1)
+ mask_len += 3;
+ else if (x1 - x0 == 3)
+ mask_len += 2;
+ else if (x1 - x0 == 7)
+ mask_len += 1;
+ }
+
+ return mask_len;
+}
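+
+/* Note, not part of the original patch: the mask_len ladder above assumes
+ * 4-bit groups. A netmask leaving k bits free within a group selects 2^k
+ * consecutive buckets, so x1 - x0 is 2^k - 1 and the group contributes
+ * 4 - k mask bits: for instance, x1 - x0 == 3 means k = 2, hence 2 bits.
+ */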
+
+/**
+ * pipapo_match_field() - Match rules against byte ranges
+ * @f: Field including the lookup table
+ * @first_rule: First of associated rules originating from same entry
+ * @rule_count: Amount of associated rules
+ * @start: Start of range to be matched
+ * @end: End of range to be matched
+ *
+ * Return: true on match, false otherwise.
+ */
+static bool pipapo_match_field(struct nft_pipapo_field *f,
+ int first_rule, int rule_count,
+ const u8 *start, const u8 *end)
+{
+ u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 };
+ u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 };
+
+ pipapo_get_boundaries(f, first_rule, rule_count, left, right);
+
+ return !memcmp(start, left,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) &&
+ !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
+}
+
+/**
+ * nft_pipapo_remove() - Remove element given key, commit
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @elem_priv: nftables API element representation containing key data
+ *
+ * Similarly to nft_pipapo_activate(), this is used as commit operation by the
+ * API, but it's called once per element in the pending transaction, so we can't
+ * implement this as a single commit operation. Closest we can get is to remove
+ * the matched element here, if any, and commit the updated matching data.
+ */
+static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = priv->clone;
+ unsigned int rules_f0, first_rule = 0;
+ struct nft_pipapo_elem *e;
+ const u8 *data;
+
+ e = nft_elem_priv_cast(elem_priv);
+ data = (const u8 *)nft_set_ext_key(&e->ext);
+
+ while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
+ union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
+ const u8 *match_start, *match_end;
+ struct nft_pipapo_field *f;
+ int i, start, rules_fx;
+
+ match_start = data;
+
+ if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END))
+ match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data;
+ else
+ match_end = data;
+
+ start = first_rule;
+ rules_fx = rules_f0;
+
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1;
+
+ if (!pipapo_match_field(f, start, rules_fx,
+ match_start, match_end))
+ break;
+
+ rulemap[i].to = start;
+ rulemap[i].n = rules_fx;
+
+ rules_fx = f->mt[start].n;
+ start = f->mt[start].to;
+
+ match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+
+ if (last && f->mt[rulemap[i].to].e == e) {
+ pipapo_drop(m, rulemap);
+ return;
+ }
+ }
+
+ first_rule += rules_f0;
+ }
+
+ WARN_ON_ONCE(1); /* elem_priv not found */
+}
+
+/**
+ * nft_pipapo_do_walk() - Walk over elements in m
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
+ * @iter: Iterator
+ *
+ * As elements are referenced in the mapping array for the last field, directly
+ * scan that array: there's no need to follow rule mappings from the first
+ * field. @m is protected either by RCU read lock or by transaction mutex.
+ */
+static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_pipapo_match *m,
+ struct nft_set_iter *iter)
+{
+ const struct nft_pipapo_field *f;
+ unsigned int i, r;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ if (iter->count < iter->skip)
+ goto cont;
+
+ e = f->mt[r].e;
+
+ iter->err = iter->fn(ctx, set, iter, &e->priv);
+ if (iter->err < 0)
+ return;
+
+cont:
+ iter->count++;
+ }
+}
+
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @iter: Iterator
+ *
+ * Test if destructive action is needed or not, clone active backend if needed
+ * and call the real function to work on the data.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ m = pipapo_maybe_clone(set);
+ if (!m) {
+ iter->err = -ENOMEM;
+ return;
+ }
+
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ break;
+ case NFT_ITER_READ:
+ rcu_read_lock();
+ m = rcu_dereference(priv->match);
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ rcu_read_unlock();
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+/**
+ * nft_pipapo_privsize() - Return the size of private data for the set
+ * @nla: netlink attributes, ignored as size doesn't depend on them
+ * @desc: Set description, ignored as size doesn't depend on it
+ *
+ * Return: size of private data for this set implementation, in bytes
+ */
+static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc)
+{
+ return sizeof(struct nft_pipapo);
+}
+
+/**
+ * nft_pipapo_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
+ * @features: Flags: NFT_SET_INTERVAL needs to be there
+ * @est: Storage for estimation data
+ *
+ * Return: true if set description is compatible, false otherwise
+ */
+static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
+ return false;
+
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
+ return false;
+
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+
+ est->space = NFT_SET_CLASS_O_N;
+
+ return true;
+}
+
+/**
+ * nft_pipapo_init() - Initialise data for a set instance
+ * @set: nftables API set representation
+ * @desc: Set description
+ * @nla: netlink attributes
+ *
+ * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink
+ * attributes, initialise internal set parameters, current instance of matching
+ * data and a copy for subsequent insertions.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int nft_pipapo_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
+ const struct nlattr * const nla[])
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ int err, i, field_count;
+
+ BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0);
+
+ field_count = desc->field_count ? : 1;
+
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255);
+ BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT);
+
+ if (field_count > NFT_PIPAPO_MAX_FIELDS)
+ return -EINVAL;
+
+ m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL);
+ if (!m)
+ return -ENOMEM;
+
+ m->field_count = field_count;
+ m->bsize_max = 0;
+
+ m->scratch = alloc_percpu(struct nft_pipapo_scratch *);
+ if (!m->scratch) {
+ err = -ENOMEM;
+ goto out_scratch;
+ }
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(m->scratch, i) = NULL;
+
+ rcu_head_init(&m->rcu);
+
+ nft_pipapo_for_each_field(f, i, m) {
+ unsigned int len = desc->field_len[i] ? : set->klen;
+
+ /* f->groups is u8 */
+ BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES *
+ BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256);
+
+ f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
+ f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
+
+ priv->width += round_up(len, sizeof(u32));
+
+ f->bsize = 0;
+ f->rules = 0;
+ f->rules_alloc = 0;
+ f->lt = NULL;
+ f->mt = NULL;
+ }
+
+ rcu_assign_pointer(priv->match, m);
+
+ return 0;
+
+out_scratch:
+ kfree(m);
+
+ return err;
+}
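+
+/* Usage sketch, not part of the original patch and with hypothetical
+ * names: this backend is typically selected for concatenated interval
+ * sets, e.g.:
+ *
+ *	nft add set inet filter myset \
+ *		'{ type ipv4_addr . inet_service ; flags interval ; }'
+ *
+ * With the initial 8-bit group width, the 4-byte address field gets 4
+ * groups and the 2-byte service field 2 groups, each starting out with
+ * empty lookup and mapping tables.
+ */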
+
+/**
+ * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array
+ * @ctx: context
+ * @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
+ */
+static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ struct nft_pipapo_match *m)
+{
+ struct nft_pipapo_field *f;
+ unsigned int i, r;
+
+ for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
+ ;
+
+ for (r = 0; r < f->rules; r++) {
+ struct nft_pipapo_elem *e;
+
+ if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e)
+ continue;
+
+ e = f->mt[r].e;
+
+ nf_tables_set_elem_destroy(ctx, set, &e->priv);
+ }
+}
+
+/**
+ * nft_pipapo_destroy() - Free private data for set and all committed elements
+ * @ctx: context
+ * @set: nftables API set representation
+ */
+static void nft_pipapo_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+
+ m = rcu_dereference_protected(priv->match, true);
+
+ if (priv->clone) {
+ nft_set_pipapo_match_destroy(ctx, set, priv->clone);
+ pipapo_free_match(priv->clone);
+ priv->clone = NULL;
+ } else {
+ nft_set_pipapo_match_destroy(ctx, set, m);
+ }
+
+ pipapo_free_match(m);
+}
+
+/**
+ * nft_pipapo_gc_init() - Initialise garbage collection
+ * @set: nftables API set representation
+ *
+ * Instead of actually setting up a periodic work for garbage collection, as
+ * this operation requires a swap of matching data with the working copy, we'll
+ * do that opportunistically with other commit operations if the interval is
+ * elapsed, so we just need to set the current jiffies timestamp here.
+ */
+static void nft_pipapo_gc_init(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+
+ priv->last_gc = jiffies;
+}
+
+const struct nft_set_type nft_set_pipapo_type = {
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+const struct nft_set_type nft_set_pipapo_avx2_type = {
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_avx2_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_avx2_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .commit = nft_pipapo_commit,
+ .abort = nft_pipapo_abort,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+#endif
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
new file mode 100644
index 000000000000..eaab422aa56a
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef _NFT_SET_PIPAPO_H
+#define _NFT_SET_PIPAPO_H
+
+#include <linux/log2.h>
+#include <net/ipv6.h> /* For the maximum length of a field */
+
+/* Count of concatenated fields depends on count of 32-bit nftables registers */
+#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
+
+/* Restrict usage to multiple fields, make sure rbtree is used otherwise */
+#define NFT_PIPAPO_MIN_FIELDS 2
+
+/* Largest supported field size */
+#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
+#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
+
+/* Bits to be grouped together in table buckets depending on set size */
+#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET
+#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8
+#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4
+#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \
+ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \
+ (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4))
+#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb)
+
+/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the
+ * small group width, and switch to the big group width if the table gets
+ * smaller than NFT_PIPAPO_LT_SIZE_LOW.
+ *
+ * Picking 2MiB as threshold (for a single table) avoids as much as possible
+ * crossing page boundaries on most architectures (x86-64 and MIPS huge pages,
+ * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which
+ * keeps performance nice in case kvmalloc() gives us non-contiguous areas.
+ */
+#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21)
+#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16)
+#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD
+#define NFT_PIPAPO_LT_SIZE_LOW (NFT_PIPAPO_LT_SIZE_THRESHOLD - \
+ NFT_PIPAPO_LT_SIZE_HYSTERESIS)
+
+/* Fields are padded to 32 bits in input registers */
+#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \
+ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32)))
+#define NFT_PIPAPO_GROUPS_PADDING(f) \
+ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \
+ NFT_PIPAPO_GROUPS_PER_BYTE(f))
+
+/* Number of buckets given by 2 ^ n, with n bucket bits */
+#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb))
+
+/* Each n-bit range maps to up to n * 2 rules */
+#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
+
+/* Use the rest of mapping table buckets for rule indices, but it makes no sense
+ * to exceed 32 bits
+ */
+#if BITS_PER_LONG == 64
+#define NFT_PIPAPO_MAP_TOBITS 32
+#else
+#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
+#endif
+
+/* ...which gives us the highest allowed index for a rule */
+#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
+ - (1UL << NFT_PIPAPO_MAP_NBITS))
+
+/* Definitions for vectorised implementations */
+#ifdef NFT_PIPAPO_ALIGN
+#define NFT_PIPAPO_ALIGN_HEADROOM \
+ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
+#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
+#else
+#define NFT_PIPAPO_ALIGN_HEADROOM 0
+#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
+#endif /* NFT_PIPAPO_ALIGN */
+
+#define nft_pipapo_for_each_field(field, index, match) \
+ for ((field) = (match)->f, (index) = 0; \
+ (index) < (match)->field_count; \
+ (index)++, (field)++)
+
+/**
+ * union nft_pipapo_map_bucket - Bucket of mapping table
+ * @to: First rule number (in next field) this rule maps to
+ * @n: Number of rules (in next field) this rule maps to
+ * @e: If there's no next field, pointer to element this rule maps to
+ */
+union nft_pipapo_map_bucket {
+ struct {
+#if BITS_PER_LONG == 64
+ static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
+ u32 to;
+
+ static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
+ u32 n;
+#else
+ unsigned long to:NFT_PIPAPO_MAP_TOBITS;
+ unsigned long n:NFT_PIPAPO_MAP_NBITS;
+#endif
+ };
+ struct nft_pipapo_elem *e;
+};
+
+/**
+ * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
+ * @rules: Number of inserted rules
+ * @bsize: Size of each bucket in lookup table, in longs
+ * @rules_alloc: Number of allocated rules, always >= rules
+ * @groups: Amount of bit groups
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @lt: Lookup table: 'groups' rows of buckets
+ * @mt: Mapping table: one bucket per rule
+ */
+struct nft_pipapo_field {
+ unsigned int rules;
+ unsigned int bsize;
+ unsigned int rules_alloc;
+ u8 groups;
+ u8 bb;
+ unsigned long *lt;
+ union nft_pipapo_map_bucket *mt;
+};
+
+/**
+ * struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
+ * @map_index: Current working bitmap index, toggled between field matches
+ * @__map: store partial matching results during lookup
+ */
+struct nft_pipapo_scratch {
+ local_lock_t bh_lock;
+ u8 map_index;
+ unsigned long __map[];
+};
+
+/**
+ * struct nft_pipapo_match - Data used for lookup and matching
+ * @field_count: Amount of fields in set
+ * @bsize_max: Maximum lookup table bucket size of all fields, in longs
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @rcu: Matching data is swapped on commits
+ * @f: Fields, with lookup and mapping tables
+ */
+struct nft_pipapo_match {
+ u8 field_count;
+ unsigned int bsize_max;
+ struct nft_pipapo_scratch * __percpu *scratch;
+ struct rcu_head rcu;
+ struct nft_pipapo_field f[] __counted_by(field_count);
+};
+
+/**
+ * struct nft_pipapo - Representation of a set
+ * @match: Currently in-use matching data
+ * @clone: Copy where pending insertions and deletions are kept
+ * @width: Total bytes to be matched for one packet, including padding
+ * @last_gc: Timestamp of last garbage collection run, jiffies
+ */
+struct nft_pipapo {
+ struct nft_pipapo_match __rcu *match;
+ struct nft_pipapo_match *clone;
+ int width;
+ unsigned long last_gc;
+};
+
+struct nft_pipapo_elem;
+
+/**
+ * struct nft_pipapo_elem - API-facing representation of single set element
+ * @priv: element placeholder
+ * @ext: nftables API extensions
+ */
+struct nft_pipapo_elem {
+ struct nft_elem_priv priv;
+ struct nft_set_ext ext;
+};
+
+int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
+ unsigned long *dst,
+ const union nft_pipapo_map_bucket *mt, bool match_only);
+
+/**
+ * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) {
+ u8 v;
+
+ v = *data >> 4;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+
+ v = *data & 0x0f;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+ }
+}
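+
+/* Illustrative example, not part of the original patch: for the data byte
+ * 0x1a, the high nibble selects bucket 1 of the first group and the low
+ * nibble bucket 10 of the second; both buckets are ANDed into @dst, so
+ * only the rules matching both nibbles remain set.
+ */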
+
+/**
+ * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_8bit(const struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group++, data++) {
+ __bitmap_and(dst, dst, lt + *data * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(8);
+ }
+}
+
+/**
+ * pipapo_estimate_size() - Estimate worst-case for set size
+ * @desc: Set description, element count and field description used here
+ *
+ * The size for this set type can vary dramatically, as it depends on the number
+ * of rules (composing netmasks) the entries expand to. We compute the worst
+ * case here.
+ *
+ * In general, for a non-ranged entry or a single composing netmask, we need
+ * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
+ * is, each input bit needs four bits of matching data), plus a bucket in the
+ * mapping table for each field.
+ *
+ * Return: worst-case set size in bytes, 0 on any overflow
+ */
+static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
+{
+ unsigned long entry_size;
+ u64 size;
+ int i;
+
+ for (i = 0, entry_size = 0; i < desc->field_count; i++) {
+ unsigned long rules;
+
+ if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
+ return 0;
+
+ /* Worst-case ranges for each concatenated field: each n-bit
+ * field can expand to up to n * 2 rules in each bucket, and
+ * each rule also needs a mapping bucket.
+ */
+ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
+ entry_size += rules *
+ NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) /
+ BITS_PER_BYTE;
+ entry_size += rules * sizeof(union nft_pipapo_map_bucket);
+ }
+
+ /* Rules in lookup and mapping tables are needed for each entry */
+ size = desc->size * entry_size;
+ if (size && div_u64(size, desc->size) != entry_size)
+ return 0;
+
+ size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2;
+
+ size += sizeof(struct nft_pipapo_field) * desc->field_count;
+
+ return size;
+}
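+
+/* Worked example, not part of the original patch, assuming 64-bit longs
+ * and an 8-byte union nft_pipapo_map_bucket: for an
+ * ipv4_addr . inet_service set, field lengths 4 and 2 give
+ * rules = ilog2(32) * 2 = 10 and ilog2(16) * 2 = 8. With 256 buckets per
+ * initial 8-bit group, the lookup table costs rules * 256 / 8 bytes, so
+ * entry_size = (10 * 32 + 10 * 8) + (8 * 32 + 8 * 8) = 720 bytes per
+ * element in the worst case.
+ */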
+
+/**
+ * pipapo_resmap_init() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Initialize all bits covered by the first field to one, so that after
+ * the first step, only the matching bits of the first bit group remain.
+ *
+ * If other fields have a large bitmap, set remainder of res_map to 0.
+ */
+static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+ const struct nft_pipapo_field *f = m->f;
+ int i;
+
+ for (i = 0; i < f->bsize; i++)
+ res_map[i] = ULONG_MAX;
+
+ for (i = f->bsize; i < m->bsize_max; i++)
+ res_map[i] = 0ul;
+}
+#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
new file mode 100644
index 000000000000..7ff90325c97f
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -0,0 +1,1302 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
+
+/* Load from memory into YMM register with non-temporal hint ("stream load"),
+ * that is, don't fetch lines from memory into the cache. This avoids pushing
+ * precious packet data out of the cache hierarchy, and is appropriate when:
+ *
+ * - loading buckets from lookup tables, as they are not going to be used
+ * again before packets are entirely classified
+ *
+ * - loading the result bitmap from the previous field, as it's never used
+ * again
+ */
+#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
+ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
+
+/* Stream a single lookup table bucket into YMM register given lookup table,
+ * group index, value of packet bits, bucket size.
+ */
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
+ (v)) * (bsize)])
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
+ (v)) * (bsize)])
+
+/* Bitwise AND: the staple operation of this algorithm */
+#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
+ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
+
+/* Jump to label if @reg is zero */
+#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
+ asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ "je %l[" #label "]" : : : : label)
+
+/* Store 256 bits from YMM register into memory. Contrary to bucket load
+ * operation, we don't bypass the cache here, as stored matching results
+ * are always used shortly after.
+ */
+#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
+
+/* Zero out a complete YMM register, @reg */
+#define NFT_PIPAPO_AVX2_ZERO(reg) \
+ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
+
+/**
+ * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
+ *
+ * This zeroes out ymm15, which is later used whenever we need to clear a
+ * memory location, by storing its content into memory.
+ */
+static void nft_pipapo_avx2_prepare(void)
+{
+ NFT_PIPAPO_AVX2_ZERO(15);
+}
+
+/**
+ * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
+ * @data: Base memory area
+ * @start: First bit to set
+ * @len: Count of bits to fill
+ *
+ * This is nothing else than a version of bitmap_set(), as used e.g. by
+ * pipapo_refill(), tailored for the microarchitectures using it and better
+ * suited for the specific usage: it's very likely that we'll set a small number
+ * of bits, not crossing a word boundary, and correct branch prediction is
+ * critical here.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ */
+static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
+{
+ int offset = start % BITS_PER_LONG;
+ unsigned long mask;
+
+ data += start / BITS_PER_LONG;
+
+ if (likely(len == 1)) {
+ *data |= BIT(offset);
+ return;
+ }
+
+ if (likely(len < BITS_PER_LONG || offset)) {
+ if (likely(len + offset <= BITS_PER_LONG)) {
+ *data |= GENMASK(len - 1 + offset, offset);
+ return;
+ }
+
+ *data |= ~0UL << offset;
+ len -= BITS_PER_LONG - offset;
+ data++;
+
+ if (len <= BITS_PER_LONG) {
+ mask = ~0UL >> (BITS_PER_LONG - len);
+ *data |= mask;
+ return;
+ }
+ }
+
+ memset(data, 0xff, len / BITS_PER_BYTE);
+ data += len / BITS_PER_LONG;
+
+ len %= BITS_PER_LONG;
+ if (len)
+ *data |= ~0UL >> (BITS_PER_LONG - len);
+}
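+
+/* Illustrative example, not part of the original patch: with start = 3
+ * and len = 2, offset is 3 and the whole run fits in one word, so the
+ * fast path sets GENMASK(4, 3), that is, bits 3 and 4, with a single OR.
+ */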
+
+/**
+ * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
+ * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
+ * @map: Bitmap to be scanned for set bits
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @last: Return index of first set bit, if this is the last field
+ *
+ * This is an alternative implementation of pipapo_refill() suitable for usage
+ * with AVX2 lookup routines: we know there are four words to be scanned, at
+ * a given offset inside the map, for each matching iteration.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ *
+ * Return: first set bit index if @last, index of first filled word otherwise.
+ */
+static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
+ unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool last)
+{
+ int ret = -1;
+
+#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
+ do { \
+ while (map[(x)]) { \
+ int r = __builtin_ctzl(map[(x)]); \
+ int i = (offset + (x)) * BITS_PER_LONG + r; \
+ \
+ if (last) \
+ return i; \
+ \
+ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
+ \
+ if (ret == -1) \
+ ret = mt[i].to; \
+ \
+ map[(x)] &= ~(1UL << r); \
+ } \
+ } while (0)
+
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
+#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * Load buckets from lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap), use the provided starting
+ * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
+ * working bitmap, @fill.
+ *
+ * This is used for 8-bit fields (e.g. protocol numbers).
+ *
+ * Out-of-order (and superscalar) execution is vital here, so it's critical to
+ * avoid false data dependencies. CPU and compiler could (mostly) take care of
+ * this on their own, but the operation ordering is explicitly given here with
+ * a likely execution order in mind, to highlight possible stalls. That's why
+ * a number of logically distinct operations (i.e. loading buckets, intersecting
+ * buckets) are interleaved.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
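+
+/* Illustrative example, not part of the original patch: matching a
+ * protocol field with pkt[0] = 0x06 (TCP) gives pg = { 0x0, 0x6 }, so
+ * each 256-bit chunk intersects bucket 0 of group 0 with bucket 6 of
+ * group 1, and with the previous result map unless @first is set.
+ */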
+
+/**
+ * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (e.g. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 6, 7);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (e.g. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_AND(9, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 6, 7);
+ NFT_PIPAPO_AVX2_AND(12, 8, 9);
+ NFT_PIPAPO_AVX2_AND(13, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 6, 7);
+ NFT_PIPAPO_AVX2_AND(13, 8, 9);
+ NFT_PIPAPO_AVX2_AND(14, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ NFT_PIPAPO_AVX2_AND(1, 1, 14);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
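+
+/* Example: for the IPv4 address 192.168.0.1, pkt is { 0xc0, 0xa8, 0x00, 0x01 }
+ * and the eight 4-bit groups above are
+ * pg[] = { 0xc, 0x0, 0xa, 0x8, 0x0, 0x0, 0x0, 0x1 }.
+ */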
+
+/**
+ * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 9, 10);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(8, 6, 7);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
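+
+/* The interleaving of bucket loads and ANDs above is deliberate: every AND
+ * depends on two previously loaded (or computed) registers, so independent
+ * loads are issued in between to hide load latency. The "Stall" comments
+ * mark the points where the final reduction can no longer be overlapped
+ * with outstanding loads.
+ */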
+
+/**
+ * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (e.g. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
+ pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
+ pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
+ pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
+ pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(14, 9, 10);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 8, 9);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 12, 13);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 14, 0);
+ NFT_PIPAPO_AVX2_AND(7, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 9, 10);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ NFT_PIPAPO_AVX2_AND(1, 8, 9);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_AND(3, 12, 0);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(4, 1, 2);
+ NFT_PIPAPO_AVX2_AND(5, 3, 4);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 8-bit fields (e.g. protocol numbers).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_AND(2, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
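+
+/* With 8-bit buckets, a whole input byte selects a single row, roughly:
+ *
+ *	row = lt + (group * NFT_PIPAPO_BUCKETS(8) + pkt[group]) * bsize;
+ *
+ * so each byte costs one load and one AND instead of two of each, in
+ * exchange for 256 rows per group rather than 2 x 16.
+ */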
+
+/**
+ * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (e.g. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(4, 3, 2);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (e.g. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(1, 6, 7);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (e.g. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
+ NFT_PIPAPO_AVX2_AND(3, 5, 6);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(3, 4, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 3, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(6, 4, 5);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @mdata: Matching data, including mapping table
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * This function should never be called in practice; it is only provided for
+ * the case where the field size doesn't match any of the known data types.
+ * Its matching rate is substantially lower than that of the AVX2 routines.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata,
+ unsigned long *map, unsigned long *fill,
+ const struct nft_pipapo_field *f,
+ int offset, const u8 *pkt,
+ bool first, bool last)
+{
+ unsigned long bsize = f->bsize;
+ int i, ret = -1, b;
+
+ if (first)
+ pipapo_resmap_init(mdata, map);
+
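+	/* Conceptually, the helpers below AND one bucket row per group into
+	 * the result bitmap one long word at a time, roughly (bucket_row()
+	 * stands in for the row address computation shown earlier):
+	 *
+	 *	for (group = 0; group < f->groups; group++)
+	 *		for (w = 0; w < f->bsize; w++)
+	 *			map[w] &= bucket_row(f, group, pkt)[w];
+	 */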
+ for (i = offset; i < bsize; i++) {
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, map, pkt);
+ else
+ pipapo_and_field_buckets_4bit(f, map, pkt);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
+
+ if (last)
+ return b;
+
+ if (ret == -1)
+ ret = b / XSAVE_YMM_SIZE;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
+ * @features: Flags: NFT_SET_INTERVAL needs to be there
+ * @est: Storage for estimation data
+ *
+ * Return: true if set is compatible and AVX2 available, false otherwise.
+ */
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
+ return false;
+
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
+ return false;
+
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+
+ est->space = NFT_SET_CLASS_O_N;
+
+ return true;
+}
+
+/**
+ * pipapo_resmap_init_avx2() - Initialise result map before first use
+ * @m: Matching data, including mapping table
+ * @res_map: Result map
+ *
+ * Like pipapo_resmap_init() but do not set start map bits covered by the first field.
+ */
+static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map)
+{
+ const struct nft_pipapo_field *f = m->f;
+ int i;
+
+ /* Starting map doesn't need to be set to all-ones for this implementation,
+ * but we do need to zero the remaining bits, if any.
+ */
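+	/* With @first set, the lookup routines store over map[0..f->bsize)
+	 * unconditionally (no previous map is loaded), so only the tail
+	 * beyond f->bsize can hold stale bits from an earlier lookup.
+	 */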
+ for (i = f->bsize; i < m->bsize_max; i++)
+ res_map[i] = 0ul;
+}
+
+/**
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
+ *
+ * This implementation exploits the repetitive characteristic of the algorithm
+ * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
+ *
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
+{
+ struct nft_pipapo_scratch *scratch;
+ const struct nft_pipapo_field *f;
+ unsigned long *res, *fill, *map;
+ bool map_index;
+ int i;
+
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ return NULL;
+
+ __local_lock_nested_bh(&scratch->bh_lock);
+ map_index = scratch->map_index;
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res = map + (map_index ? m->bsize_max : 0);
+ fill = map + (map_index ? 0 : m->bsize_max);
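+
+	/* res and fill are the two halves of the per-CPU scratch area: each
+	 * field reads the previous result from res and writes its own match
+	 * bitmap into fill, and the two halves are swapped after every field.
+	 */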
+
+ pipapo_resmap_init_avx2(m, res);
+
+ /* Note that we don't need a valid MXCSR state for any of the
+ * operations we use here, so pass 0 as mask and spare a LDMXCSR
+ * instruction.
+ */
+ kernel_fpu_begin_mask(0);
+
+ nft_pipapo_avx2_prepare();
+
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1, first = !i;
+ int ret = 0;
+
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
+ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
+ ret, data, \
+ first, last))
+
+ if (likely(f->bb == 8)) {
+ if (f->groups == 1) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
+ } else if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
+ } else if (f->groups == 6) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
+ } else if (f->groups == 16) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
+ first, last);
+ }
+ } else {
+ if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
+ } else if (f->groups == 8) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
+ } else if (f->groups == 12) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
+ } else if (f->groups == 32) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
+ ret, data,
+ first, last);
+ }
+ }
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+next_match:
+ if (ret < 0) {
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+ }
+
+ if (last) {
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[ret].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask))) {
+ ret = pipapo_refill(res, f->bsize, f->rules,
+ fill, f->mt, last);
+ goto next_match;
+ }
+
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return e;
+ }
+
+ map_index = !map_index;
+ swap(res, fill);
+ data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable, or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const u8 *rp = (const u8 *)key;
+ const struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+ if (unlikely(!irq_fpu_usable())) {
+ const struct nft_set_ext *ext;
+
+ ext = nft_pipapo_lookup(net, set, key);
+
+ local_bh_enable();
+ return ext;
+ }
+
+ m = rcu_dereference(priv->match);
+
+ e = pipapo_get_avx2(m, rp, 0, get_jiffies_64());
+ local_bh_enable();
+
+ return e ? &e->ext : NULL;
+}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
new file mode 100644
index 000000000000..c2999b63da3f
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+#define _NFT_SET_PIPAPO_AVX2_H
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+#include <asm/fpu/xstate.h>
+#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+struct nft_pipapo_match;
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp);
+#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index fa61208371f8..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -16,16 +13,17 @@
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
- seqcount_t count;
- struct delayed_work gc_work;
+ seqcount_rwlock_t count;
+ unsigned long last_gc;
};
struct nft_rbtree_elem {
+ struct nft_elem_priv priv;
struct rb_node node;
struct nft_set_ext ext;
};
@@ -36,40 +34,52 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
- const struct nft_rbtree_elem *interval)
+static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
+{
+ return !nft_rbtree_interval_end(rbe);
+}
+
+static int nft_rbtree_cmp(const struct nft_set *set,
+ const struct nft_rbtree_elem *e1,
+ const struct nft_rbtree_elem *e2)
+{
+ return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+ set->klen);
+}
+
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
{
- return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+ return nft_set_elem_expired(&rbe->ext);
}
-static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext,
- unsigned int seq)
+static const struct nft_set_ext *
+__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, unsigned int seq)
{
struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
- const void *this;
int d;
parent = rcu_dereference_raw(priv->root.rb_node);
while (parent != NULL) {
if (read_seqcount_retry(&priv->count, seq))
- return false;
+ return NULL;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- this = nft_set_ext_key(&rbe->ext);
- d = memcmp(this, key, set->klen);
+ d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
if (d < 0) {
parent = rcu_dereference_raw(parent->rb_left);
if (interval &&
- nft_rbtree_equal(set, this, interval) &&
+ !nft_rbtree_cmp(set, rbe, interval) &&
nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(interval))
+ nft_rbtree_interval_start(interval))
continue;
- interval = rbe;
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_rbtree_elem_expired(rbe))
+ interval = rbe;
} else if (d > 0)
parent = rcu_dereference_raw(parent->rb_right);
else {
@@ -77,41 +87,48 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
parent = rcu_dereference_raw(parent->rb_left);
continue;
}
- if (nft_rbtree_interval_end(rbe))
- goto out;
- *ext = &rbe->ext;
- return true;
+ if (nft_rbtree_elem_expired(rbe))
+ return NULL;
+
+ if (nft_rbtree_interval_end(rbe)) {
+ if (nft_set_is_anonymous(set))
+ return NULL;
+ parent = rcu_dereference_raw(parent->rb_left);
+ interval = NULL;
+ continue;
+ }
+
+ return &rbe->ext;
}
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_interval_end(interval)) {
- *ext = &interval->ext;
- return true;
- }
-out:
- return false;
+ nft_rbtree_interval_start(interval))
+ return &interval->ext;
+
+ return NULL;
}
-static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+INDIRECT_CALLABLE_SCOPE
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
- bool ret;
+ const struct nft_set_ext *ext;
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
- if (ret || !read_seqcount_retry(&priv->count, seq))
- return ret;
+ ext = __nft_rbtree_lookup(net, set, key, seq);
+ if (ext || !read_seqcount_retry(&priv->count, seq))
+ return ext;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
+ ext = __nft_rbtree_lookup(net, set, key, seq);
read_unlock_bh(&priv->lock);
- return ret;
+ return ext;
}
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
@@ -142,8 +159,13 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
if (flags & NFT_SET_ELEM_INTERVAL_END)
interval = rbe;
} else {
- if (!nft_set_elem_active(&rbe->ext, genmask))
+ if (!nft_set_elem_active(&rbe->ext, genmask)) {
parent = rcu_dereference_raw(parent->rb_left);
+ continue;
+ }
+
+ if (nft_set_elem_expired(&rbe->ext))
+ return false;
if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
@@ -151,12 +173,17 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
*elem = rbe;
return true;
}
- return false;
+
+ if (nft_rbtree_interval_end(rbe))
+ interval = NULL;
+
+ parent = rcu_dereference_raw(parent->rb_left);
}
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
+ !nft_set_elem_expired(&interval->ext) &&
((!nft_rbtree_interval_end(interval) &&
!(flags & NFT_SET_ELEM_INTERVAL_END)) ||
(nft_rbtree_interval_end(interval) &&
@@ -168,8 +195,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
return false;
}
-static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static struct nft_elem_priv *
+nft_rbtree_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
@@ -180,55 +208,274 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
if (ret || !read_seqcount_retry(&priv->count, seq))
- return rbe;
+ return &rbe->priv;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
- if (!ret)
- rbe = ERR_PTR(-ENOENT);
read_unlock_bh(&priv->lock);
- return rbe;
+ if (!ret)
+ return ERR_PTR(-ENOENT);
+
+ return &rbe->priv;
+}
+
+static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ lockdep_assert_held_write(&priv->lock);
+ nft_setelem_data_deactivate(net, set, &rbe->priv);
+ rb_erase(&rbe->node, &priv->root);
+}
+
+static const struct nft_rbtree_elem *
+nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ struct nft_set *set = (struct nft_set *)__set;
+ struct rb_node *prev = rb_prev(&rbe->node);
+ struct net *net = read_pnet(&set->net);
+ struct nft_rbtree_elem *rbe_prev;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+ if (!gc)
+ return ERR_PTR(-ENOMEM);
+
+ /* search for end interval coming before this element.
+ * end intervals don't carry a timeout extension, they
+ * are coupled with the interval start element.
+ */
+ while (prev) {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ if (nft_rbtree_interval_end(rbe_prev) &&
+ nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
+ break;
+
+ prev = rb_prev(prev);
+ }
+
+ rbe_prev = NULL;
+ if (prev) {
+ rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
+
+ /* There is always room in this trans gc for this element:
+ * memory allocation never actually happens, hence the warning
+ * splat if it ever does. No need to set NFT_SET_ELEM_DEAD_BIT,
+ * as this is synchronous gc, which never fails.
+ */
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
+
+ nft_trans_gc_elem_add(gc, rbe_prev);
+ }
+
+ nft_rbtree_gc_elem_remove(net, set, priv, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return ERR_PTR(-ENOMEM);
+
+ nft_trans_gc_elem_add(gc, rbe);
+
+ nft_trans_gc_queue_sync_done(gc);
+
+ return rbe_prev;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+ struct nft_rbtree_elem *rbe,
+ struct rb_node *first)
+{
+ struct nft_rbtree_elem *first_elem;
+
+ first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+ /* this element is closest to where the new element is to be inserted:
+ * update the first element for the node list path.
+ */
+ if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+ return true;
+
+ return false;
}
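+
+/* Example: the range 10-20 is stored as two elements, a start element with
+ * key 10 and an end element with key 20 carrying NFT_SET_ELEM_INTERVAL_END;
+ * keys are ordered from highest to lowest in the tree, so the end element
+ * of a range is always walked before its start element.
+ */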
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+ struct rb_node *node, *next, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
+ u8 cur_genmask = nft_genmask_cur(net);
u8 genmask = nft_genmask_next(net);
- struct nft_rbtree_elem *rbe;
- struct rb_node *parent, **p;
+ u64 tstamp = nft_net_tstamp(net);
int d;
+ /* Descend the tree to search for an existing element whose key is
+ * greater than the new element's key. This is the first element from
+ * which to walk the ordered elements and find a possible overlap.
+ */
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
parent = *p;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
- d = memcmp(nft_set_ext_key(&rbe->ext),
- nft_set_ext_key(&new->ext),
- set->klen);
- if (d < 0)
+ d = nft_rbtree_cmp(set, rbe, new);
+
+ if (d < 0) {
p = &parent->rb_left;
- else if (d > 0)
+ } else if (d > 0) {
+ if (!first ||
+ nft_rbtree_update_first(set, rbe, first))
+ first = &rbe->node;
+
p = &parent->rb_right;
- else {
- if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(new)) {
+ } else {
+ if (nft_rbtree_interval_end(rbe))
p = &parent->rb_left;
- } else if (!nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_end(new)) {
+ else
p = &parent->rb_right;
- } else if (nft_set_elem_active(&rbe->ext, genmask)) {
- *ext = &rbe->ext;
- return -EEXIST;
- } else {
- p = &parent->rb_left;
+ }
+ }
+
+ if (!first)
+ first = rb_first(&priv->root);
+
+ /* Detect overlap by going through the list of valid tree nodes.
+ * Values stored in the tree are in reversed order, starting from
+ * highest to lowest value.
+ */
+ for (node = first; node != NULL; node = next) {
+ next = rb_next(node);
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ if (!nft_set_elem_active(&rbe->ext, genmask))
+ continue;
+
+ /* perform garbage collection to avoid bogus overlap reports
+ * but skip new elements in this transaction.
+ */
+ if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
+ nft_set_elem_active(&rbe->ext, cur_genmask)) {
+ const struct nft_rbtree_elem *removed_end;
+
+ removed_end = nft_rbtree_gc_elem(set, priv, rbe);
+ if (IS_ERR(removed_end))
+ return PTR_ERR(removed_end);
+
+ if (removed_end == rbe_le || removed_end == rbe_ge)
+ return -EAGAIN;
+
+ continue;
+ }
+
+ d = nft_rbtree_cmp(set, rbe, new);
+ if (d == 0) {
+ /* Matching end element: no need to look for an
+ * overlapping greater or equal element.
+ */
+ if (nft_rbtree_interval_end(rbe)) {
+ rbe_le = rbe;
+ break;
+ }
+
+ /* first element that is greater than or equal to the key value. */
+ if (!rbe_ge) {
+ rbe_ge = rbe;
+ continue;
+ }
+
+ /* this is a closer greater-or-equal element, update it. */
+ if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+ rbe_ge = rbe;
+ continue;
}
+
+ /* element is equal to the key value; make sure the flags are
+ * the same: an existing greater-or-equal start element
+ * must not be replaced by a greater-or-equal end element.
+ */
+ if ((nft_rbtree_interval_start(new) &&
+ nft_rbtree_interval_start(rbe_ge)) ||
+ (nft_rbtree_interval_end(new) &&
+ nft_rbtree_interval_end(rbe_ge))) {
+ rbe_ge = rbe;
+ continue;
+ }
+ } else if (d > 0) {
+ /* annotate element greater than the new element. */
+ rbe_ge = rbe;
+ continue;
+ } else if (d < 0) {
+ /* annotate element less than the new element. */
+ rbe_le = rbe;
+ break;
}
}
+
+ /* - new start element matching existing start element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+ nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+ *elem_priv = &rbe_ge->priv;
+ return -EEXIST;
+ }
+
+ /* - new end element matching existing end element: full overlap
+ * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+ */
+ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+ nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+ *elem_priv = &rbe_le->priv;
+ return -EEXIST;
+ }
+
+ /* - new start element with existing closest, less or equal key value
+ * being a start element: partial overlap, reported as -ENOTEMPTY.
+ * Anonymous sets allow for two consecutive start elements since they
+ * are constant, skip them to avoid bogus overlap reports.
+ */
+ if (!nft_set_is_anonymous(set) && rbe_le &&
+ nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+ return -ENOTEMPTY;
+
+ /* - new end element with existing closest, less or equal key value
+ * being an end element: partial overlap, reported as -ENOTEMPTY.
+ */
+ if (rbe_le &&
+ nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
+ return -ENOTEMPTY;
+
+ /* - new end element with existing closest, greater or equal key value
+ * being an end element: partial overlap, reported as -ENOTEMPTY
+ */
+ if (rbe_ge &&
+ nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+ return -ENOTEMPTY;
+
+ /* Accepted element: pick insertion point depending on key value */
+ parent = NULL;
+ p = &priv->root.rb_node;
+ while (*p != NULL) {
+ parent = *p;
+ rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ d = nft_rbtree_cmp(set, rbe, new);
+
+ if (d < 0)
+ p = &parent->rb_left;
+ else if (d > 0)
+ p = &parent->rb_right;
+ else if (nft_rbtree_interval_end(rbe))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -236,66 +483,74 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
- struct nft_set_ext **ext)
+ struct nft_elem_priv **elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
int err;
+ do {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ cond_resched();
+
+ write_lock_bh(&priv->lock);
+ write_seqcount_begin(&priv->count);
+ err = __nft_rbtree_insert(net, set, rbe, elem_priv);
+ write_seqcount_end(&priv->count);
+ write_unlock_bh(&priv->lock);
+ } while (err == -EAGAIN);
+
+ return err;
+}
+
+static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
+{
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
- err = __nft_rbtree_insert(net, set, rbe, ext);
+ rb_erase(&rbe->node, &priv->root);
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
-
- return err;
}
static void nft_rbtree_remove(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe = elem->priv;
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- rb_erase(&rbe->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
+ nft_rbtree_erase(priv, rbe);
}
static void nft_rbtree_activate(const struct net *net,
const struct nft_set *set,
- const struct nft_set_elem *elem)
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = elem->priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- nft_set_elem_change_active(net, set, &rbe->ext);
- nft_set_elem_clear_busy(&rbe->ext);
+ nft_clear(net, &rbe->ext);
}
-static bool nft_rbtree_flush(const struct net *net,
- const struct nft_set *set, void *priv)
+static void nft_rbtree_flush(const struct net *net,
+ const struct nft_set *set,
+ struct nft_elem_priv *elem_priv)
{
- struct nft_rbtree_elem *rbe = priv;
+ struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
- if (!nft_set_elem_mark_busy(&rbe->ext) ||
- !nft_is_active(net, &rbe->ext)) {
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &rbe->ext);
}
-static void *nft_rbtree_deactivate(const struct net *net,
- const struct nft_set *set,
- const struct nft_set_elem *elem)
+static struct nft_elem_priv *
+nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
- struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_next(net);
+ u64 tstamp = nft_net_tstamp(net);
int d;
while (parent != NULL) {
@@ -308,113 +563,140 @@ static void *nft_rbtree_deactivate(const struct net *net,
else if (d > 0)
parent = parent->rb_right;
else {
- if (!nft_set_elem_active(&rbe->ext, genmask)) {
- parent = parent->rb_left;
- continue;
- }
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(this)) {
+ nft_rbtree_interval_start(this)) {
parent = parent->rb_left;
continue;
- } else if (!nft_rbtree_interval_end(rbe) &&
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
+ } else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
+ break;
+ } else if (!nft_set_elem_active(&rbe->ext, genmask)) {
+ parent = parent->rb_left;
+ continue;
}
- nft_rbtree_flush(net, set, rbe);
- return rbe;
+ nft_rbtree_flush(net, set, &rbe->priv);
+ return &rbe->priv;
}
}
return NULL;
}
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
- struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
- struct nft_set_elem elem;
struct rb_node *node;
- read_lock_bh(&priv->lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (iter->count < iter->skip)
goto cont;
- if (!nft_set_elem_active(&rbe->ext, iter->genmask))
- goto cont;
-
- elem.priv = rbe;
- iter->err = iter->fn(ctx, set, iter, &elem);
- if (iter->err < 0) {
- read_unlock_bh(&priv->lock);
+ iter->err = iter->fn(ctx, set, iter, &rbe->priv);
+ if (iter->err < 0)
return;
- }
cont:
iter->count++;
}
- read_unlock_bh(&priv->lock);
}
-static void nft_rbtree_gc(struct work_struct *work)
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
- struct nft_set_gc_batch *gcb = NULL;
- struct nft_rbtree *priv;
- struct rb_node *node;
- struct nft_set *set;
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+ nft_rbtree_do_walk(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ read_lock_bh(&priv->lock);
+ nft_rbtree_do_walk(ctx, set, iter);
+ read_unlock_bh(&priv->lock);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ nft_setelem_data_deactivate(net, set, &rbe->priv);
+ nft_rbtree_erase(priv, rbe);
+}
+
+static void nft_rbtree_gc(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+ struct net *net = read_pnet(&set->net);
+ u64 tstamp = nft_net_tstamp(net);
+ struct rb_node *node, *next;
+ struct nft_trans_gc *gc;
- priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ for (node = rb_first(&priv->root); node ; node = next) {
+ next = rb_next(node);
- write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ /* elements are reversed in the rbtree for historical reasons,
+ * from highest to lowest value, that is why the end element is
+ * always visited before the start element.
+ */
if (nft_rbtree_interval_end(rbe)) {
rbe_end = rbe;
continue;
}
- if (!nft_set_elem_expired(&rbe->ext))
- continue;
- if (nft_set_elem_mark_busy(&rbe->ext))
+ if (!__nft_set_elem_expired(&rbe->ext, tstamp))
continue;
- if (rbe_prev) {
- rb_erase(&rbe_prev->node, &priv->root);
- rbe_prev = NULL;
- }
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (!gcb)
- break;
-
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- rbe_prev = rbe;
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+ /* the end element needs to be removed first, as it
+ * has no timeout extension.
+ */
if (rbe_end) {
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_end);
- rb_erase(&rbe_end->node, &priv->root);
+ nft_rbtree_gc_remove(net, set, priv, rbe_end);
+ nft_trans_gc_elem_add(gc, rbe_end);
rbe_end = NULL;
}
- node = rb_next(node);
- if (!node)
- break;
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ goto try_later;
+
+ nft_rbtree_gc_remove(net, set, priv, rbe);
+ nft_trans_gc_elem_add(gc, rbe);
}
- if (rbe_prev)
- rb_erase(&rbe_prev->node, &priv->root);
- write_seqcount_end(&priv->count);
- write_unlock_bh(&priv->lock);
- nft_set_gc_batch_complete(gcb);
+try_later:
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
+ if (gc) {
+ gc = nft_trans_gc_catchall_sync(gc);
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -429,36 +711,35 @@ static int nft_rbtree_init(const struct nft_set *set,
{
struct nft_rbtree *priv = nft_set_priv(set);
+ BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);
+
rwlock_init(&priv->lock);
- seqcount_init(&priv->count);
+ seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
- INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
- if (set->flags & NFT_SET_TIMEOUT)
- queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
- nft_set_gc_interval(set));
-
return 0;
}
-static void nft_rbtree_destroy(const struct nft_set *set)
+static void nft_rbtree_destroy(const struct nft_ctx *ctx,
+ const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct rb_node *node;
- cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- nft_set_elem_destroy(set, rbe, true);
+ nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
}
}
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
+ if (desc->field_count > 1)
+ return false;
+
if (desc->size)
est->size = sizeof(struct nft_rbtree) +
desc->size * sizeof(struct nft_rbtree_elem);
@@ -471,8 +752,62 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_rbtree_type __read_mostly = {
- .owner = THIS_MODULE,
+static void nft_rbtree_commit(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ nft_rbtree_gc(set);
+}
+
+static void nft_rbtree_gc_init(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ priv->last_gc = jiffies;
+}
+
+/* rbtree stores each range bound as its own element, so a range is
+ * composed of two elements ...
+ */
+static u32 nft_rbtree_ksize(u32 size)
+{
+ return size * 2;
+}
+
+/* ... hide this detail from userspace. */
+static u32 nft_rbtree_usize(u32 size)
+{
+ if (!size)
+ return 0;
+
+ return size / 2;
+}
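+
+/* Example: a set created with size 128 from userspace gets room for 256
+ * rbtree elements via nft_rbtree_ksize(), and a backend count of 256 is
+ * reported back to userspace as 128 via nft_rbtree_usize().
+ */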
+
+static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe;
+ struct rb_node *node;
+ const void *key;
+
+ node = rb_last(&priv->root);
+ if (!node)
+ return 0;
+
+ rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (!nft_rbtree_interval_end(rbe))
+ return 0;
+
+ key = nft_set_ext_key(&rbe->ext);
+ if (memchr(key, 1, set->klen))
+ return 0;
+
+ /* this is the all-zero no-match element. */
+ return 1;
+}
+
+const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
.privsize = nft_rbtree_privsize,
@@ -485,8 +820,13 @@ struct nft_set_type nft_set_rbtree_type __read_mostly = {
.deactivate = nft_rbtree_deactivate,
.flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
+ .commit = nft_rbtree_commit,
+ .gc_init = nft_rbtree_gc_init,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
.get = nft_rbtree_get,
+ .ksize = nft_rbtree_ksize,
+ .usize = nft_rbtree_usize,
+ .adjust_maxsize = nft_rbtree_adjust_maxsize,
},
};
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index d7f3776dfd71..36affbb697c2 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,11 +9,101 @@
struct nft_socket {
enum nft_socket_keys key:8;
+ u8 level; /* cgroupv2 level to extract */
+ u8 level_user; /* cgroupv2 level provided by userspace */
+ u8 len;
union {
- enum nft_registers dreg:8;
+ u8 dreg;
};
};
+static void nft_socket_wildcard(const struct nft_pktinfo *pkt,
+ struct nft_regs *regs, struct sock *sk,
+ u32 *dest)
+{
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr));
+ break;
+#endif
+ default:
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+}
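+
+/* Example: a socket bound to 0.0.0.0 or :: stores 1 in the destination
+ * register, a socket bound to a specific local address stores 0.
+ */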
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static noinline bool
+nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
+{
+ struct cgroup *cgrp;
+ u64 cgid;
+
+ if (!sk_fullsock(sk))
+ return false;
+
+ cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+ if (!cgrp)
+ return false;
+
+ cgid = cgroup_id(cgrp);
+ memcpy(dest, &cgid, sizeof(u64));
+ return true;
+}
+
+/* process context only, uses current->nsproxy. */
+static noinline int nft_socket_cgroup_subtree_level(void)
+{
+ struct cgroup *cgrp = cgroup_get_from_path("/");
+ int level;
+
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ level = cgrp->level;
+
+ cgroup_put(cgrp);
+
+ if (level > 255)
+ return -ERANGE;
+
+ if (WARN_ON_ONCE(level < 0))
+ return -EINVAL;
+
+ return level;
+}
+#endif
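
nft_socket_cgroup_subtree_level() anchors userspace-relative levels: a rule loaded inside a container names cgroup ancestors relative to its own cgroup root, so nft_socket_init() (further down) adds the depth of "/" in the current cgroup namespace before storing the effective level. A standalone sketch of that fixup, with the u8 cap made explicit (the function name is hypothetical):

static int effective_level(unsigned int level_user, int subtree_level)
{
	unsigned int level;

	if (level_user > 255)
		return -1;	/* -EOPNOTSUPP in the kernel */

	level = level_user + subtree_level;
	if (level > 255)	/* implies a giant cgroup tree */
		return -1;

	return level;		/* fits the u8 priv->level */
}
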
+
+static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
+{
+ const struct net_device *indev = nft_in(pkt);
+ const struct sk_buff *skb = pkt->skb;
+ struct sock *sk = NULL;
+
+ if (!indev)
+ return NULL;
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, indev);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, indev);
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return sk;
+}
+
static void nft_socket_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -27,50 +117,54 @@ static void nft_socket_eval(const struct nft_expr *expr,
sk = NULL;
if (!sk)
- switch(nft_pf(pkt)) {
- case NFPROTO_IPV4:
- sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
- break;
-#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
- case NFPROTO_IPV6:
- sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
- break;
-#endif
- default:
- WARN_ON_ONCE(1);
- regs->verdict.code = NFT_BREAK;
- return;
- }
+ sk = nft_socket_do_lookup(pkt);
if (!sk) {
regs->verdict.code = NFT_BREAK;
return;
}
- /* So that subsequent socket matching not to require other lookups. */
- skb->sk = sk;
-
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
nft_reg_store8(dest, inet_sk_transparent(sk));
break;
case NFT_SOCKET_MARK:
if (sk_fullsock(sk)) {
- *dest = sk->sk_mark;
+ *dest = READ_ONCE(sk->sk_mark);
} else {
regs->verdict.code = NFT_BREAK;
- return;
+ goto out_put_sk;
+ }
+ break;
+ case NFT_SOCKET_WILDCARD:
+ if (!sk_fullsock(sk)) {
+ regs->verdict.code = NFT_BREAK;
+ goto out_put_sk;
}
+ nft_socket_wildcard(pkt, regs, sk, dest);
break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2:
+ if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
+ regs->verdict.code = NFT_BREAK;
+ goto out_put_sk;
+ }
+ break;
+#endif
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
}
+
+out_put_sk:
+ if (sk != skb->sk)
+ sock_gen_put(sk);
}
static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = {
- [NFTA_SOCKET_KEY] = { .type = NLA_U32 },
+ [NFTA_SOCKET_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_SOCKET_DREG] = { .type = NLA_U32 },
+ [NFTA_SOCKET_LEVEL] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_socket_init(const struct nft_ctx *ctx,
@@ -94,35 +188,106 @@ static int nft_socket_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_SOCKET_KEY]));
switch(priv->key) {
case NFT_SOCKET_TRANSPARENT:
+ case NFT_SOCKET_WILDCARD:
len = sizeof(u8);
break;
case NFT_SOCKET_MARK:
len = sizeof(u32);
break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2: {
+ unsigned int level;
+ int err;
+
+ if (!tb[NFTA_SOCKET_LEVEL])
+ return -EINVAL;
+
+ level = ntohl(nla_get_be32(tb[NFTA_SOCKET_LEVEL]));
+ if (level > 255)
+ return -EOPNOTSUPP;
+
+ err = nft_socket_cgroup_subtree_level();
+ if (err < 0)
+ return err;
+
+ priv->level_user = level;
+
+ level += err;
+ /* Implies a giant cgroup tree */
+ if (level > 255)
+ return -EOPNOTSUPP;
+
+ priv->level = level;
+ len = sizeof(u64);
+ break;
+ }
+#endif
default:
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_SOCKET_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_socket_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_socket *priv = nft_expr_priv(expr);
- if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
+ if (nla_put_be32(skb, NFTA_SOCKET_KEY, htonl(priv->key)))
return -1;
if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
return -1;
+ if (priv->key == NFT_SOCKET_CGROUPV2 &&
+ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user)))
+ return -1;
return 0;
}
+static bool nft_socket_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_socket *priv = nft_expr_priv(expr);
+ const struct nft_socket *socket;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ socket = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != socket->key ||
+ priv->dreg != socket->dreg ||
+ priv->level != socket->level) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
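
The reduce callback feeds nf_tables' register tracking: if the previous writer of the destination register was an identical socket expression and no bitwise operation touched the register since, this evaluation can be elided. A standalone illustration of the idea (the names here are made up, not kernel API):

#include <stdbool.h>
#include <string.h>

struct expr  { int key; int dreg; };
struct track { const struct expr *selector[16]; bool dirty[16]; };

static bool can_elide(struct track *t, const struct expr *e)
{
	const struct expr *last = t->selector[e->dreg];

	if (last && !t->dirty[e->dreg] &&
	    memcmp(last, e, sizeof(*e)) == 0)
		return true;		/* same value already in dreg */

	t->selector[e->dreg] = e;	/* become the register's new writer */
	t->dirty[e->dreg] = false;
	return false;
}
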
+
+static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_LOCAL_OUT));
+}
+
static struct nft_expr_type nft_socket_type;
static const struct nft_expr_ops nft_socket_ops = {
.type = &nft_socket_type,
@@ -130,6 +295,8 @@ static const struct nft_expr_ops nft_socket_ops = {
.eval = nft_socket_eval,
.init = nft_socket_init,
.dump = nft_socket_dump,
+ .validate = nft_socket_validate,
+ .reduce = nft_socket_reduce,
};
static struct nft_expr_type nft_socket_type __read_mostly = {
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
new file mode 100644
index 000000000000..5d3e51825985
--- /dev/null
+++ b/net/netfilter/nft_synproxy.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/netlink.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_synproxy.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_synproxy.h>
+
+struct nft_synproxy {
+ struct nf_synproxy_info info;
+};
+
+static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = {
+ [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 },
+ [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 },
+ [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 },
+};
+
+static void nft_synproxy_tcp_options(struct synproxy_options *opts,
+ const struct tcphdr *tcp,
+ struct synproxy_net *snet,
+ struct nf_synproxy_info *info,
+ const struct nft_synproxy *priv)
+{
+ this_cpu_inc(snet->stats->syn_received);
+ if (tcp->ece && tcp->cwr)
+ opts->options |= NF_SYNPROXY_OPT_ECN;
+
+ opts->options &= priv->info.options;
+ opts->mss_encode = opts->mss_option;
+ opts->mss_option = info->mss;
+ if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
+ synproxy_init_timestamp_cookie(info, opts);
+ else
+ opts->options &= ~(NF_SYNPROXY_OPT_WSCALE |
+ NF_SYNPROXY_OPT_SACK_PERM |
+ NF_SYNPROXY_OPT_ECN);
+}
+
+static void nft_synproxy_eval_v4(const struct nft_synproxy *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ const struct tcphdr *tcp,
+ struct tcphdr *_tcph,
+ struct synproxy_options *opts)
+{
+ struct nf_synproxy_info info = priv->info;
+ struct net *net = nft_net(pkt);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *skb = pkt->skb;
+
+ if (tcp->syn) {
+ /* Initial SYN from client */
+ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+ synproxy_send_client_synack(net, skb, tcp, opts);
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else if (tcp->ack) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack(net, skb, tcp, opts,
+ ntohl(tcp->seq))) {
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else {
+ regs->verdict.code = NF_DROP;
+ }
+ }
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_synproxy_eval_v6(const struct nft_synproxy *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt,
+ const struct tcphdr *tcp,
+ struct tcphdr *_tcph,
+ struct synproxy_options *opts)
+{
+ struct nf_synproxy_info info = priv->info;
+ struct net *net = nft_net(pkt);
+ struct synproxy_net *snet = synproxy_pernet(net);
+ struct sk_buff *skb = pkt->skb;
+
+ if (tcp->syn) {
+ /* Initial SYN from client */
+ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
+ synproxy_send_client_synack_ipv6(net, skb, tcp, opts);
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else if (tcp->ack) {
+ /* ACK from client */
+ if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts,
+ ntohl(tcp->seq))) {
+ consume_skb(skb);
+ regs->verdict.code = NF_STOLEN;
+ } else {
+ regs->verdict.code = NF_DROP;
+ }
+ }
+}
+#endif /* CONFIG_NF_TABLES_IPV6 */
+
+static void nft_synproxy_do_eval(const struct nft_synproxy *priv,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct synproxy_options opts = {};
+ struct sk_buff *skb = pkt->skb;
+ int thoff = nft_thoff(pkt);
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+
+ if (pkt->tprot != IPPROTO_TCP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ tcp = skb_header_pointer(skb, thoff,
+ sizeof(struct tcphdr),
+ &_tcph);
+ if (!tcp) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ if (!synproxy_parse_options(skb, thoff, tcp, &opts)) {
+ regs->verdict.code = NF_DROP;
+ return;
+ }
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts);
+ return;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case htons(ETH_P_IPV6):
+ nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts);
+ return;
+#endif
+ }
+ regs->verdict.code = NFT_BREAK;
+}
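
Note the verdict discipline above: non-TCP traffic falls through the rule with NFT_BREAK, malformed or bad-checksum packets are dropped, and whenever synproxy answers on its own the skb is consumed and NF_STOLEN tells the hook infrastructure the packet is gone. A sketch of that last pairing (kernel context assumed, function name hypothetical):

static unsigned int steal_packet(struct sk_buff *skb)
{
	consume_skb(skb);	/* we own the skb from here on */
	return NF_STOLEN;	/* caller must neither free nor forward it */
}
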
+
+static int nft_synproxy_do_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_synproxy *priv)
+{
+ struct synproxy_net *snet = synproxy_pernet(ctx->net);
+ u32 flags;
+ int err;
+
+ if (tb[NFTA_SYNPROXY_MSS])
+ priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS]));
+ if (tb[NFTA_SYNPROXY_WSCALE])
+ priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]);
+ if (tb[NFTA_SYNPROXY_FLAGS]) {
+ flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS]));
+ if (flags & ~NF_SYNPROXY_OPT_MASK)
+ return -EOPNOTSUPP;
+ priv->info.options = flags;
+ }
+
+ err = nf_ct_netns_get(ctx->net, ctx->family);
+ if (err)
+ return err;
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ err = nf_synproxy_ipv6_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ break;
+#endif
+ case NFPROTO_INET:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+ err = nf_synproxy_ipv6_init(snet, ctx->net);
+ if (err) {
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ goto nf_ct_failure;
+ }
+ break;
+ }
+
+ return 0;
+
+nf_ct_failure:
+ nf_ct_netns_put(ctx->net, ctx->family);
+ return err;
+}
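
The NFPROTO_INET branch is the usual unwind-partial-setup idiom: when the second family's init fails, the first is torn down before taking the shared error label. A standalone skeleton (all stub names hypothetical):

static int  init_a(void)         { return 0;  }
static int  init_b(void)         { return -1; }
static void fini_a(void)         { }
static void common_cleanup(void) { }	/* nf_ct_netns_put() in the real code */

static int init_both(void)
{
	int err = init_a();

	if (err)
		goto fail;

	err = init_b();
	if (err) {
		fini_a();	/* undo the half that already succeeded */
		goto fail;
	}
	return 0;

fail:
	common_cleanup();
	return err;
}
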
+
+static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
+{
+ struct synproxy_net *snet = synproxy_pernet(ctx->net);
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+#endif
+ case NFPROTO_INET:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+ }
+ nf_ct_netns_put(ctx->net, ctx->family);
+}
+
+static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv)
+{
+ if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) ||
+ nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) ||
+ nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_synproxy_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ nft_synproxy_do_eval(priv, regs, pkt);
+}
+
+static int nft_synproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD));
+}
+
+static int nft_synproxy_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ return nft_synproxy_do_init(ctx, tb, priv);
+}
+
+static void nft_synproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ nft_synproxy_do_destroy(ctx);
+}
+
+static int nft_synproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
+{
+ struct nft_synproxy *priv = nft_expr_priv(expr);
+
+ return nft_synproxy_do_dump(skb, priv);
+}
+
+static struct nft_expr_type nft_synproxy_type;
+static const struct nft_expr_ops nft_synproxy_ops = {
+ .eval = nft_synproxy_eval,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)),
+ .init = nft_synproxy_init,
+ .destroy = nft_synproxy_destroy,
+ .dump = nft_synproxy_dump,
+ .type = &nft_synproxy_type,
+ .validate = nft_synproxy_validate,
+ .reduce = NFT_REDUCE_READONLY,
+};
+
+static struct nft_expr_type nft_synproxy_type __read_mostly = {
+ .ops = &nft_synproxy_ops,
+ .name = "synproxy",
+ .owner = THIS_MODULE,
+ .policy = nft_synproxy_policy,
+ .maxattr = NFTA_SYNPROXY_MAX,
+};
+
+static int nft_synproxy_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ return nft_synproxy_do_init(ctx, tb, priv);
+}
+
+static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ nft_synproxy_do_destroy(ctx);
+}
+
+static int nft_synproxy_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ return nft_synproxy_do_dump(skb, priv);
+}
+
+static void nft_synproxy_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_synproxy *priv = nft_obj_data(obj);
+
+ nft_synproxy_do_eval(priv, regs, pkt);
+}
+
+static void nft_synproxy_obj_update(struct nft_object *obj,
+ struct nft_object *newobj)
+{
+ struct nft_synproxy *newpriv = nft_obj_data(newobj);
+ struct nft_synproxy *priv = nft_obj_data(obj);
+
+ priv->info = newpriv->info;
+}
+
+static struct nft_object_type nft_synproxy_obj_type;
+static const struct nft_object_ops nft_synproxy_obj_ops = {
+ .type = &nft_synproxy_obj_type,
+ .size = sizeof(struct nft_synproxy),
+ .init = nft_synproxy_obj_init,
+ .destroy = nft_synproxy_obj_destroy,
+ .dump = nft_synproxy_obj_dump,
+ .eval = nft_synproxy_obj_eval,
+ .update = nft_synproxy_obj_update,
+};
+
+static struct nft_object_type nft_synproxy_obj_type __read_mostly = {
+ .type = NFT_OBJECT_SYNPROXY,
+ .ops = &nft_synproxy_obj_ops,
+ .maxattr = NFTA_SYNPROXY_MAX,
+ .policy = nft_synproxy_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_synproxy_module_init(void)
+{
+ int err;
+
+ err = nft_register_obj(&nft_synproxy_obj_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_expr(&nft_synproxy_type);
+ if (err < 0)
+ goto err;
+
+ return 0;
+
+err:
+ nft_unregister_obj(&nft_synproxy_obj_type);
+ return err;
+}
+
+static void __exit nft_synproxy_module_exit(void)
+{
+ nft_unregister_expr(&nft_synproxy_type);
+ nft_unregister_obj(&nft_synproxy_obj_type);
+}
+
+module_init(nft_synproxy_module_init);
+module_exit(nft_synproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("synproxy");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY);
+MODULE_DESCRIPTION("nftables SYNPROXY expression support");
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index f92a82c73880..50481280abd2 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -13,9 +13,9 @@
#endif
struct nft_tproxy {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_port:8;
- u8 family;
+ u8 sreg_addr;
+ u8 sreg_port;
+ u8 family;
};
static void nft_tproxy_eval_v4(const struct nft_expr *expr,
@@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
__be16 tport = 0;
struct sock *sk;
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
if (!hp) {
regs->verdict.code = NFT_BREAK;
@@ -46,11 +52,11 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
if (priv->sreg_addr)
- taddr = regs->data[priv->sreg_addr];
+ taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]);
taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
if (priv->sreg_port)
- tport = regs->data[priv->sreg_port];
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -82,16 +88,17 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
const struct nft_tproxy *priv = nft_expr_priv(expr);
struct sk_buff *skb = pkt->skb;
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct in6_addr taddr;
- int thoff = pkt->xt.thoff;
+ int thoff = nft_thoff(pkt);
struct udphdr _hdr, *hp;
+ struct in6_addr taddr;
__be16 tport = 0;
struct sock *sk;
int l4proto;
memset(&taddr, 0, sizeof(taddr));
- if (!pkt->tprot_set) {
+ if (pkt->tprot != IPPROTO_TCP &&
+ pkt->tprot != IPPROTO_UDP) {
regs->verdict.code = NFT_BREAK;
return;
}
@@ -117,7 +124,7 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
if (priv->sreg_port)
- tport = regs->data[priv->sreg_port];
+ tport = nft_reg_load_be16(&regs->data[priv->sreg_port]);
if (!tport)
tport = hp->dest;
@@ -176,7 +183,7 @@ static void nft_tproxy_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
- [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
+ [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
[NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
};
@@ -218,14 +225,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
switch (priv->family) {
case NFPROTO_IPV4:
- alen = FIELD_SIZEOF(union nf_inet_addr, in);
+ alen = sizeof_field(union nf_inet_addr, in);
err = nf_defrag_ipv4_enable(ctx->net);
if (err)
return err;
break;
#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
- alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+ alen = sizeof_field(union nf_inet_addr, in6);
err = nf_defrag_ipv6_enable(ctx->net);
if (err)
return err;
@@ -247,15 +254,15 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
}
if (tb[NFTA_TPROXY_REG_ADDR]) {
- priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, alen);
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR],
+ &priv->sreg_addr, alen);
if (err < 0)
return err;
}
if (tb[NFTA_TPROXY_REG_PORT]) {
- priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
- err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+ err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT],
+ &priv->sreg_port, sizeof(u16));
if (err < 0)
return err;
}
@@ -263,8 +270,31 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
return 0;
}
+static void nft_tproxy_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ nf_defrag_ipv4_disable(ctx->net);
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ nf_defrag_ipv6_disable(ctx->net);
+ break;
+#endif
+ case NFPROTO_UNSPEC:
+ nf_defrag_ipv4_disable(ctx->net);
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ nf_defrag_ipv6_disable(ctx->net);
+#endif
+ break;
+ }
+}
+
static int nft_tproxy_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tproxy *priv = nft_expr_priv(expr);
@@ -282,13 +312,27 @@ static int nft_tproxy_dump(struct sk_buff *skb,
return 0;
}
+static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+}
+
static struct nft_expr_type nft_tproxy_type;
static const struct nft_expr_ops nft_tproxy_ops = {
.type = &nft_tproxy_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
.eval = nft_tproxy_eval,
.init = nft_tproxy_init,
+ .destroy = nft_tproxy_destroy,
.dump = nft_tproxy_dump,
+ .reduce = NFT_REDUCE_READONLY,
+ .validate = nft_tproxy_validate,
};
static struct nft_expr_type nft_tproxy_type __read_mostly = {
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 3a15f219e4e7..a12486ae089d 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -11,10 +11,13 @@
#include <net/ip_tunnels.h>
#include <net/vxlan.h>
#include <net/erspan.h>
+#include <net/geneve.h>
struct nft_tunnel {
enum nft_tunnel_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
+ enum nft_tunnel_mode mode:8;
+ u8 len;
};
static void nft_tunnel_get_eval(const struct nft_expr *expr,
@@ -29,14 +32,32 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
switch (priv->key) {
case NFT_TUNNEL_PATH:
- nft_reg_store8(dest, !!tun_info);
+ if (!tun_info) {
+ nft_reg_store8(dest, false);
+ return;
+ }
+ if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+ (priv->mode == NFT_TUNNEL_MODE_RX &&
+ !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+ (priv->mode == NFT_TUNNEL_MODE_TX &&
+ (tun_info->mode & IP_TUNNEL_INFO_TX)))
+ nft_reg_store8(dest, true);
+ else
+ nft_reg_store8(dest, false);
break;
case NFT_TUNNEL_ID:
if (!tun_info) {
regs->verdict.code = NFT_BREAK;
return;
}
- *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+ if (priv->mode == NFT_TUNNEL_MODE_NONE ||
+ (priv->mode == NFT_TUNNEL_MODE_RX &&
+ !(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
+ (priv->mode == NFT_TUNNEL_MODE_TX &&
+ (tun_info->mode & IP_TUNNEL_INFO_TX)))
+ *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+ else
+ regs->verdict.code = NFT_BREAK;
break;
default:
WARN_ON(1);
@@ -45,8 +66,9 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
}
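
Both the NFT_TUNNEL_PATH and NFT_TUNNEL_ID cases above open-code the same direction filter. A hypothetical helper capturing it, using the kernel types already in scope (the patch itself does not introduce this function):

static bool nft_tunnel_mode_match(enum nft_tunnel_mode mode,
				  const struct ip_tunnel_info *tun_info)
{
	return mode == NFT_TUNNEL_MODE_NONE ||
	       (mode == NFT_TUNNEL_MODE_RX &&
		!(tun_info->mode & IP_TUNNEL_INFO_TX)) ||
	       (mode == NFT_TUNNEL_MODE_TX &&
		(tun_info->mode & IP_TUNNEL_INFO_TX));
}
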
static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
- [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_MODE] = NLA_POLICY_MAX(NLA_BE32, 255),
};
static int nft_tunnel_get_init(const struct nft_ctx *ctx,
@@ -56,7 +78,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
struct nft_tunnel *priv = nft_expr_priv(expr);
u32 len;
- if (!tb[NFTA_TUNNEL_KEY] &&
+ if (!tb[NFTA_TUNNEL_KEY] ||
!tb[NFTA_TUNNEL_DREG])
return -EINVAL;
@@ -72,14 +94,21 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
+ if (tb[NFTA_TUNNEL_MODE]) {
+ priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE]));
+ if (priv->mode > NFT_TUNNEL_MODE_MAX)
+ return -EOPNOTSUPP;
+ } else {
+ priv->mode = NFT_TUNNEL_MODE_NONE;
+ }
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_TUNNEL_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
static int nft_tunnel_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_tunnel *priv = nft_expr_priv(expr);
@@ -87,12 +116,39 @@ static int nft_tunnel_get_dump(struct sk_buff *skb,
goto nla_put_failure;
if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
goto nla_put_failure;
+ if (nla_put_be32(skb, NFTA_TUNNEL_MODE, htonl(priv->mode)))
+ goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
+static bool nft_tunnel_get_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ const struct nft_tunnel *tunnel;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ tunnel = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != tunnel->key ||
+ priv->dreg != tunnel->dreg ||
+ priv->mode != tunnel->mode) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return false;
+}
+
static struct nft_expr_type nft_tunnel_type;
static const struct nft_expr_ops nft_tunnel_get_ops = {
.type = &nft_tunnel_type,
@@ -100,10 +156,12 @@ static const struct nft_expr_ops nft_tunnel_get_ops = {
.eval = nft_tunnel_get_eval,
.init = nft_tunnel_get_init,
.dump = nft_tunnel_get_dump,
+ .reduce = nft_tunnel_get_reduce,
};
static struct nft_expr_type nft_tunnel_type __read_mostly = {
.name = "tunnel",
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_get_ops,
.policy = nft_tunnel_policy,
.maxattr = NFTA_TUNNEL_MAX,
@@ -114,9 +172,10 @@ struct nft_tunnel_opts {
union {
struct vxlan_metadata vxlan;
struct erspan_metadata erspan;
+ u8 data[IP_TUNNEL_OPTS_MAX];
} u;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
u32 len;
- __be16 flags;
};
struct nft_tunnel_obj {
@@ -136,8 +195,8 @@ static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx,
struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
- nft_tunnel_ip_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
+ nft_tunnel_ip_policy, NULL);
if (err < 0)
return err;
@@ -165,8 +224,8 @@ static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx,
struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
- nft_tunnel_ip6_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
+ nft_tunnel_ip6_policy, NULL);
if (err < 0)
return err;
@@ -201,8 +260,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1];
int err;
- err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
- nft_tunnel_opts_vxlan_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
+ nft_tunnel_opts_vxlan_policy, NULL);
if (err < 0)
return err;
@@ -212,14 +271,16 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
opts->len = sizeof(struct vxlan_metadata);
- opts->flags = TUNNEL_VXLAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags);
return 0;
}
static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 },
[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
- [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
};
@@ -230,11 +291,15 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
uint8_t hwid, dir;
int err, version;
- err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr,
- nft_tunnel_opts_erspan_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX,
+ attr, nft_tunnel_opts_erspan_policy,
+ NULL);
if (err < 0)
return err;
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION])
+ return -EINVAL;
+
version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
switch (version) {
case ERSPAN_VERSION:
@@ -261,14 +326,60 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
opts->u.erspan.version = version;
opts->len = sizeof(struct erspan_metadata);
- opts->flags = TUNNEL_ERSPAN_OPT;
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags);
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
+ [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+};
+
+static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
+ struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
+ int err, data_len;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr,
+ nft_tunnel_opts_geneve_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_DATA])
+ return -EINVAL;
+
+ attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ opts->len += sizeof(*opt) + data_len;
+ if (opts->len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
+ ip_tunnel_flags_zero(opts->flags);
+ __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags);
return 0;
}
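
nft_tunnel_obj_geneve_init() packs each option as a 4-byte struct geneve_opt header followed by length*4 bytes of payload, appending successive NFTA_TUNNEL_KEY_OPTS_GENEVE attributes back to back into the shared opts->u.data buffer. A standalone sketch of the size accounting it enforces (the macro values are stand-ins):

#include <stddef.h>

#define GENEVE_OPT_HDR	4	/* sizeof(struct geneve_opt) header part */
#define OPTS_MAX	255	/* stand-in for IP_TUNNEL_OPTS_MAX */

static int append_opt(size_t *len, size_t data_len)
{
	if (data_len % 4)		/* payload must be 4-byte multiples */
		return -1;		/* -EINVAL in the kernel */

	*len += GENEVE_OPT_HDR + data_len;
	if (*len > OPTS_MAX)		/* all options share one buffer */
		return -1;

	return 0;
}
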
static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = {
+ .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE },
[NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
[NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, },
};
static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
@@ -276,22 +387,44 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
- int err;
+ struct nlattr *nla;
+ int err, rem;
+ u32 type = 0;
- err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
- nft_tunnel_opts_policy, NULL);
+ err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
+ nft_tunnel_opts_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
- err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
- opts);
- } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
- err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
- opts);
- } else {
- return -EOPNOTSUPP;
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case NFTA_TUNNEL_KEY_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_vxlan_init(nla, opts);
+ if (err)
+ return err;
+ type = IP_TUNNEL_VXLAN_OPT_BIT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_erspan_init(nla, opts);
+ if (err)
+ return err;
+ type = IP_TUNNEL_ERSPAN_OPT_BIT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_GENEVE:
+ if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
+ return -EINVAL;
+ err = nft_tunnel_obj_geneve_init(nla, opts);
+ if (err)
+ return err;
+ type = IP_TUNNEL_GENEVE_OPT_BIT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
}
return err;
@@ -304,6 +437,8 @@ static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] =
[NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, },
[NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, },
[NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, },
+ [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, },
[NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, },
};
@@ -322,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
memset(&info, 0, sizeof(info));
info.mode = IP_TUNNEL_INFO_TX;
info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
- info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ __set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags);
if (tb[NFTA_TUNNEL_KEY_IP]) {
err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
@@ -351,18 +488,16 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
- info.key.tun_flags &= ~TUNNEL_CSUM;
+ __clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
- info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+ info.key.tun_flags);
if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
- info.key.tun_flags |= TUNNEL_SEQ;
+ __set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags);
}
if (tb[NFTA_TUNNEL_KEY_TOS])
info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
- if (tb[NFTA_TUNNEL_KEY_TTL])
- info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
- else
- info.key.ttl = U8_MAX;
+ info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX);
if (tb[NFTA_TUNNEL_KEY_OPTS]) {
err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
@@ -371,11 +506,19 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
return err;
}
- md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL,
+ GFP_KERNEL_ACCOUNT);
if (!md)
return -ENOMEM;
memcpy(&md->u.tun_info, &info, sizeof(info));
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT);
+ if (err < 0) {
+ metadata_dst_free(md);
+ return err;
+ }
+#endif
ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len,
priv->opts.flags);
priv->md = md;
@@ -400,24 +543,33 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
struct nlattr *nest;
if (info->mode & IP_TUNNEL_INFO_IPV6) {
- nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6);
+ nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP6);
if (!nest)
return -1;
- if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
- nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
- nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+ if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC,
+ &info->key.u.ipv6.src) < 0 ||
+ nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST,
+ &info->key.u.ipv6.dst) < 0 ||
+ nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL,
+ info->key.label)) {
+ nla_nest_cancel(skb, nest);
return -1;
+ }
nla_nest_end(skb, nest);
} else {
- nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP);
+ nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_IP);
if (!nest)
return -1;
- if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
- nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+ if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC,
+ info->key.u.ipv4.src) < 0 ||
+ nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST,
+ info->key.u.ipv4.dst) < 0) {
+ nla_nest_cancel(skb, nest);
return -1;
+ }
nla_nest_end(skb, nest);
}
@@ -429,42 +581,77 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
struct nft_tunnel_obj *priv)
{
struct nft_tunnel_opts *opts = &priv->opts;
- struct nlattr *nest;
+ struct nlattr *nest, *inner;
- nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS);
+ nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS);
if (!nest)
return -1;
- if (opts->flags & TUNNEL_VXLAN_OPT) {
+ if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
+ if (!inner)
+ goto failure;
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
htonl(opts->u.vxlan.gbp)))
- return -1;
- } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ goto inner_failure;
+ nla_nest_end(skb, inner);
+ } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
+ if (!inner)
+ goto failure;
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION,
+ htonl(opts->u.erspan.version)))
+ goto inner_failure;
switch (opts->u.erspan.version) {
case ERSPAN_VERSION:
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
opts->u.erspan.u.index))
- return -1;
+ goto inner_failure;
break;
case ERSPAN_VERSION2:
if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
get_hwid(&opts->u.erspan.u.md2)) ||
nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
opts->u.erspan.u.md2.dir))
- return -1;
+ goto inner_failure;
break;
}
+ nla_nest_end(skb, inner);
+ } else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ while (opts->len > offset) {
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ opt = (struct geneve_opt *)(opts->u.data + offset);
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto inner_failure;
+ offset += sizeof(*opt) + opt->length * 4;
+ nla_nest_end(skb, inner);
+ }
}
nla_nest_end(skb, nest);
-
return 0;
+
+inner_failure:
+ nla_nest_cancel(skb, inner);
+failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
}
static int nft_tunnel_ports_dump(struct sk_buff *skb,
struct ip_tunnel_info *info)
{
- if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
- nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 ||
+ nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0)
return -1;
return 0;
@@ -475,11 +662,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb,
{
u32 flags = 0;
- if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
- if (!(info->key.tun_flags & TUNNEL_CSUM))
+ if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
- if (info->key.tun_flags & TUNNEL_SEQ)
+ if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags))
flags |= NFT_TUNNEL_F_SEQ_NUMBER;
if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
@@ -530,6 +717,7 @@ static const struct nft_object_ops nft_tunnel_obj_ops = {
static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
.type = NFT_OBJECT_TUNNEL,
+ .family = NFPROTO_NETDEV,
.ops = &nft_tunnel_obj_ops,
.maxattr = NFTA_TUNNEL_KEY_MAX,
.policy = nft_tunnel_key_policy,
@@ -564,3 +752,4 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("tunnel");
MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
+MODULE_DESCRIPTION("nftables tunnel expression support");
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index b08865ec5ed3..3210cfc966ab 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -1,7 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*
* Generic part shared by ipv4 and ipv6 backends.
*/
@@ -18,17 +16,18 @@
#include <net/xfrm.h>
static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = {
- [NFTA_XFRM_KEY] = { .type = NLA_U32 },
+ [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DIR] = { .type = NLA_U8 },
- [NFTA_XFRM_SPNUM] = { .type = NLA_U32 },
+ [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_XFRM_DREG] = { .type = NLA_U32 },
};
struct nft_xfrm {
enum nft_xfrm_keys key:8;
- enum nft_registers dreg:8;
+ u8 dreg;
u8 dir;
u8 spnum;
+ u8 len;
};
static int nft_xfrm_get_init(const struct nft_ctx *ctx,
@@ -52,7 +51,7 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
return -EOPNOTSUPP;
}
- priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY]));
+ priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY]));
switch (priv->key) {
case NFT_XFRM_KEY_REQID:
case NFT_XFRM_KEY_SPI:
@@ -88,9 +87,9 @@ static int nft_xfrm_get_init(const struct nft_ctx *ctx,
priv->spnum = spnum;
- priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]);
- return nft_validate_register_store(ctx, priv->dreg, NULL,
- NFT_DATA_VALUE, len);
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
}
/* Return true if key asks for daddr/saddr and current
@@ -113,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
return true;
}
- return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
+ mode == XFRM_MODE_IPTFS;
}
static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
@@ -135,13 +135,13 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
WARN_ON_ONCE(1);
break;
case NFT_XFRM_KEY_DADDR_IP4:
- *dest = state->id.daddr.a4;
+ *dest = (__force __u32)state->id.daddr.a4;
return;
case NFT_XFRM_KEY_DADDR_IP6:
memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr));
return;
case NFT_XFRM_KEY_SADDR_IP4:
- *dest = state->props.saddr.a4;
+ *dest = (__force __u32)state->props.saddr.a4;
return;
case NFT_XFRM_KEY_SADDR_IP6:
memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr));
@@ -150,7 +150,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
*dest = state->props.reqid;
return;
case NFT_XFRM_KEY_SPI:
- *dest = state->id.spi;
+ *dest = (__force __u32)state->id.spi;
return;
}
@@ -213,7 +213,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr,
}
static int nft_xfrm_get_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
+ const struct nft_expr *expr, bool reset)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
@@ -230,12 +230,16 @@ static int nft_xfrm_get_dump(struct sk_buff *skb,
return 0;
}
-static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
- const struct nft_data **data)
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
{
const struct nft_xfrm *priv = nft_expr_priv(expr);
unsigned int hooks;
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
+ ctx->family != NFPROTO_INET)
+ return -EOPNOTSUPP;
+
switch (priv->dir) {
case XFRM_POLICY_IN:
hooks = (1 << NF_INET_FORWARD) |
@@ -255,6 +259,31 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
return nft_chain_validate_hooks(ctx->chain, hooks);
}
+static bool nft_xfrm_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ const struct nft_xfrm *xfrm;
+
+ if (!nft_reg_track_cmp(track, expr, priv->dreg)) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ xfrm = nft_expr_priv(track->regs[priv->dreg].selector);
+ if (priv->key != xfrm->key ||
+ priv->dreg != xfrm->dreg ||
+ priv->dir != xfrm->dir ||
+ priv->spnum != xfrm->spnum) {
+ nft_reg_track_update(track, expr, priv->dreg, priv->len);
+ return false;
+ }
+
+ if (!track->regs[priv->dreg].bitwise)
+ return true;
+
+ return nft_expr_reduce_bitwise(track, expr);
+}
static struct nft_expr_type nft_xfrm_type;
static const struct nft_expr_ops nft_xfrm_get_ops = {
@@ -264,6 +293,7 @@ static const struct nft_expr_ops nft_xfrm_get_ops = {
.init = nft_xfrm_get_init,
.dump = nft_xfrm_get_dump,
.validate = nft_xfrm_validate,
+ .reduce = nft_xfrm_reduce,
};
static struct nft_expr_type nft_xfrm_type __read_mostly = {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index e8da9a9bba73..008419db815a 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -17,16 +17,17 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP &&
+ !csum_fold(skb->csum)) ||
!csum_tcpudp_magic(iph->saddr, iph->daddr,
skb->len - dataoff, protocol,
skb->csum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
- if (protocol == 0)
+ if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP)
skb->csum = 0;
else
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
@@ -50,7 +51,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
skb->len - dataoff, 0);
@@ -78,7 +79,7 @@ __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
skb->csum = ~csum_unfold(
csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
@@ -105,7 +106,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
case CHECKSUM_COMPLETE:
if (len == skb->len - dataoff)
return nf_ip6_checksum(skb, hook, dataoff, protocol);
- /* fall through */
+ fallthrough;
case CHECKSUM_NONE:
hsum = skb_checksum(skb, 0, dataoff, 0);
skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
@@ -162,7 +163,7 @@ EXPORT_SYMBOL_GPL(nf_checksum_partial);
int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict, unsigned short family)
{
- const struct nf_ipv6_ops *v6ops;
int ret = 0;
switch (family) {
@@ -170,9 +171,7 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
ret = nf_ip_route(net, dst, fl, strict);
break;
case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- ret = v6ops->route(net, dst, fl, strict);
+ ret = nf_ip6_route(net, dst, fl, strict);
break;
}
@@ -180,20 +179,54 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
}
EXPORT_SYMBOL_GPL(nf_route);
-int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
+/* Only get and check the lengths, without doing any hop-by-hop processing. */
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
{
- const struct nf_ipv6_ops *v6ops;
- int ret = 0;
-
- switch (entry->state.pf) {
- case AF_INET:
- ret = nf_ip_reroute(skb, entry);
- break;
- case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- ret = v6ops->reroute(skb, entry);
- break;
+ int len, off = sizeof(struct ipv6hdr);
+ unsigned char *nh;
+
+ if (!pskb_may_pull(skb, off + 8))
+ return -ENOMEM;
+ nh = (unsigned char *)(ipv6_hdr(skb) + 1);
+ len = (nh[1] + 1) << 3;
+
+ if (!pskb_may_pull(skb, off + len))
+ return -ENOMEM;
+ nh = skb_network_header(skb);
+
+ off += 2;
+ len -= 2;
+ while (len > 0) {
+ int optlen;
+
+ if (nh[off] == IPV6_TLV_PAD1) {
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ return -EBADMSG;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ return -EBADMSG;
+
+ if (nh[off] == IPV6_TLV_JUMBO) {
+ u32 pkt_len;
+
+ if (nh[off + 1] != 4 || (off & 3) != 2)
+ return -EBADMSG;
+ pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+ if (pkt_len <= IPV6_MAXPLEN ||
+ ipv6_hdr(skb)->payload_len)
+ return -EBADMSG;
+ if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+ return -EBADMSG;
+ *plen = pkt_len;
+ }
+ off += optlen;
+ len -= optlen;
}
- return ret;
+
+ return len ? -EBADMSG : 0;
}
+EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
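
nf_ip6_check_hbh_len() walks the hop-by-hop TLVs without acting on them; the only option given special treatment is the RFC 2675 jumbo payload, which must carry a 4-byte length at a 4n+2 offset, is only legal when the claimed length exceeds IPV6_MAXPLEN while the ordinary payload_len is zero, and may not claim more data than the packet holds. A standalone restatement of those checks (parameter names hypothetical):

#include <stdint.h>

#define IPV6_MAXPLEN 65535

static int check_jumbo(unsigned int off, uint8_t opt_len, uint32_t pkt_len,
		       uint16_t payload_len, uint32_t data_after_ipv6hdr)
{
	if (opt_len != 4 || (off & 3) != 2)
		return -1;	/* -EBADMSG: bad size or misaligned */
	if (pkt_len <= IPV6_MAXPLEN || payload_len)
		return -1;	/* -EBADMSG: not a valid jumbogram */
	if (pkt_len > data_after_ipv6hdr)
		return -1;	/* -EBADMSG: claims more than present */
	return 0;
}
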
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index aecadd471e1d..90b7630421c4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* x_tables core - Backend for {ip,ip6,arp}_tables
*
@@ -7,11 +8,6 @@
* Based on existing ip_tables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
@@ -28,6 +24,7 @@
#include <linux/audit.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp.h>
@@ -42,6 +39,24 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
#define XT_PCPU_BLOCK_SIZE 4096
#define XT_MAX_TABLE_SIZE (512 * 1024 * 1024)
+struct xt_template {
+ struct list_head list;
+
+ /* called when table is needed in the given netns */
+ int (*table_init)(struct net *net);
+
+ struct module *me;
+
+ /* A unique name... */
+ char name[XT_TABLE_MAXNAMELEN];
+};
+
+static struct list_head xt_templates[NFPROTO_NUMPROTO];
+
+struct xt_pernet {
+ struct list_head tables[NFPROTO_NUMPROTO];
+};
+
struct compat_delta {
unsigned int offset; /* offset in kernel */
int delta; /* delta in 32bit user land */
@@ -51,7 +66,7 @@ struct xt_af {
struct mutex mutex;
struct list_head match;
struct list_head target;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct mutex compat_mutex;
struct compat_delta *compat_tab;
unsigned int number; /* number of slots in compat_tab[] */
@@ -59,7 +74,8 @@ struct xt_af {
#endif
};
-static struct xt_af *xt;
+static unsigned int xt_pernet_id __read_mostly;
+static struct xt_af *xt __read_mostly;
static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_UNSPEC] = "x",
@@ -227,7 +243,7 @@ xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision)
EXPORT_SYMBOL_GPL(xt_request_find_match);
/* Find target, grabs ref. Returns ERR_PTR() on error. */
-struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
+static struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
{
struct xt_target *t;
int err = -ENOENT;
@@ -255,7 +271,6 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
return ERR_PTR(err);
}
-EXPORT_SYMBOL(xt_find_target);
struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision)
{
@@ -335,6 +350,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_match *m;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(m, &xt[af].match, list) {
if (strcmp(m->name, name) == 0) {
if (m->revision > *bestp)
@@ -343,6 +359,7 @@ static int match_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return match_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -355,6 +372,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
const struct xt_target *t;
int have_rev = 0;
+ mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &xt[af].target, list) {
if (strcmp(t->name, name) == 0) {
if (t->revision > *bestp)
@@ -363,6 +381,7 @@ static int target_revfn(u8 af, const char *name, u8 revision, int *bestp)
have_rev = 1;
}
}
+ mutex_unlock(&xt[af].mutex);
if (af != NFPROTO_UNSPEC && !have_rev)
return target_revfn(NFPROTO_UNSPEC, name, revision, bestp);
@@ -376,12 +395,10 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target,
{
int have_rev, best = -1;
- mutex_lock(&xt[af].mutex);
if (target == 1)
have_rev = target_revfn(af, name, revision, &best);
else
have_rev = match_revfn(af, name, revision, &best);
- mutex_unlock(&xt[af].mutex);
/* Nothing at all? Return 0 to try loading module. */
if (best == -1) {
@@ -461,7 +478,7 @@ int xt_check_proc_name(const char *name, unsigned int size)
EXPORT_SYMBOL(xt_check_proc_name);
int xt_check_match(struct xt_mtchk_param *par,
- unsigned int size, u_int8_t proto, bool inv_proto)
+ unsigned int size, u16 proto, bool inv_proto)
{
int ret;
@@ -644,7 +661,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
return usersize == kernsize && strnlen(msg, msglen) < msglen;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
{
struct xt_af *xp = &xt[af];
@@ -736,7 +753,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
{
const struct xt_match *match = m->u.kernel.match;
struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
- int pad, off = xt_compat_match_offset(match);
+ int off = xt_compat_match_offset(match);
u_int16_t msize = cm->u.user.match_size;
char name[sizeof(m->u.user.name)];
@@ -746,15 +763,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
match->compat_from_user(m->data, cm->data);
else
memcpy(m->data, cm->data, msize - sizeof(*cm));
- pad = XT_ALIGN(match->matchsize) - match->matchsize;
- if (pad > 0)
- memset(m->data + match->matchsize, 0, pad);
msize += off;
m->u.user.match_size = msize;
- strlcpy(name, match->name, sizeof(name));
+ strscpy(name, match->name, sizeof(name));
module_put(match->me);
- strncpy(m->u.user.name, name, sizeof(m->u.user.name));
+ strscpy_pad(m->u.user.name, name, sizeof(m->u.user.name));
*size += off;
*dstptr += msize;
@@ -850,7 +864,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
__alignof__(struct compat_xt_entry_match));
}
EXPORT_SYMBOL(xt_compat_check_entry_offsets);
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
/**
* xt_check_entry_offsets - validate arp/ip/ip6t_entry
@@ -868,7 +882,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets);
* match structures are aligned, and that the last structure ends where
* the target structure begins.
*
- * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version.
+ * Also see xt_compat_check_entry_offsets for the CONFIG_NETFILTER_XTABLES_COMPAT version.
*
* The arp/ip/ip6t_entry structure @base must have passed following tests:
* - it must point to a valid memory location
@@ -944,14 +958,14 @@ EXPORT_SYMBOL(xt_check_entry_offsets);
*
* @size: number of entries
*
- * Return: NULL or kmalloc'd or vmalloc'd array
+ * Return: NULL or zeroed kmalloc'd or vmalloc'd array
*/
unsigned int *xt_alloc_entry_offsets(unsigned int size)
{
if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int))
return NULL;
- return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO);
+ return kvcalloc(size, sizeof(unsigned int), GFP_KERNEL);
}
EXPORT_SYMBOL(xt_alloc_entry_offsets);
@@ -984,7 +998,7 @@ bool xt_find_jump_offset(const unsigned int *offsets,
EXPORT_SYMBOL(xt_find_jump_offset);
int xt_check_target(struct xt_tgchk_param *par,
- unsigned int size, u_int8_t proto, bool inv_proto)
+ unsigned int size, u16 proto, bool inv_proto)
{
int ret;
@@ -1033,34 +1047,34 @@ int xt_check_target(struct xt_tgchk_param *par,
EXPORT_SYMBOL_GPL(xt_check_target);
/**
- * xt_copy_counters_from_user - copy counters and metadata from userspace
+ * xt_copy_counters - copy counters and metadata from a sockptr_t
*
- * @user: src pointer to userspace memory
+ * @arg: src sockptr
* @len: alleged size of userspace memory
* @info: where to store the xt_counters_info metadata
- * @compat: true if we setsockopt call is done by 32bit task on 64bit kernel
*
* Copies counter meta data from @user and stores it in @info.
*
* vmallocs memory to hold the counters, then copies the counter data
* from @user to the new memory and returns a pointer to it.
*
- * If @compat is true, @info gets converted automatically to the 64bit
- * representation.
+ * If called from a compat syscall, @info gets converted automatically to the
+ * 64bit representation.
*
* The metadata associated with the counters is stored in @info.
*
* Return: returns pointer that caller has to test via IS_ERR().
* If IS_ERR is false, caller has to vfree the pointer.
*/
-void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
- struct xt_counters_info *info, bool compat)
+void *xt_copy_counters(sockptr_t arg, unsigned int len,
+ struct xt_counters_info *info)
{
+ size_t offset;
void *mem;
u64 size;
-#ifdef CONFIG_COMPAT
- if (compat) {
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall()) {
/* structures only differ in size due to alignment */
struct compat_xt_counters_info compat_tmp;
@@ -1068,12 +1082,12 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(compat_tmp);
- if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
+ if (copy_from_sockptr(&compat_tmp, arg, sizeof(compat_tmp)) != 0)
return ERR_PTR(-EFAULT);
memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
info->num_counters = compat_tmp.num_counters;
- user += sizeof(compat_tmp);
+ offset = sizeof(compat_tmp);
} else
#endif
{
@@ -1081,10 +1095,10 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
return ERR_PTR(-EINVAL);
len -= sizeof(*info);
- if (copy_from_user(info, user, sizeof(*info)) != 0)
+ if (copy_from_sockptr(info, arg, sizeof(*info)) != 0)
return ERR_PTR(-EFAULT);
- user += sizeof(*info);
+ offset = sizeof(*info);
}
info->name[sizeof(info->name) - 1] = '\0';
@@ -1098,15 +1112,15 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
if (!mem)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(mem, user, len) == 0)
+ if (copy_from_sockptr_offset(mem, arg, offset, len) == 0)
return mem;
vfree(mem);
return ERR_PTR(-EFAULT);
}
-EXPORT_SYMBOL_GPL(xt_copy_counters_from_user);
+EXPORT_SYMBOL_GPL(xt_copy_counters);
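To see why the sockptr_t conversion pays off, here is a hedged sketch of a do_add_counters()-style caller (modeled on the ip_tables setsockopt path; details trimmed): the same code now serves native and compat syscalls, with in_compat_syscall() deciding the layout inside xt_copy_counters().

	static int do_add_counters(struct net *net, sockptr_t arg,
				   unsigned int len)
	{
		struct xt_counters_info tmp;
		struct xt_counters *paddc;

		paddc = xt_copy_counters(arg, len, &tmp);
		if (IS_ERR(paddc))
			return PTR_ERR(paddc);
		/* ... add tmp.num_counters counters from paddc ... */
		vfree(paddc);
		return 0;
	}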
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
int xt_compat_target_offset(const struct xt_target *target)
{
u_int16_t csize = target->compatsize ? : target->targetsize;
@@ -1119,7 +1133,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
{
const struct xt_target *target = t->u.kernel.target;
struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
- int pad, off = xt_compat_target_offset(target);
+ int off = xt_compat_target_offset(target);
u_int16_t tsize = ct->u.user.target_size;
char name[sizeof(t->u.user.name)];
@@ -1128,16 +1142,14 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
if (target->compat_from_user)
target->compat_from_user(t->data, ct->data);
else
- memcpy(t->data, ct->data, tsize - sizeof(*ct));
- pad = XT_ALIGN(target->targetsize) - target->targetsize;
- if (pad > 0)
- memset(t->data + target->targetsize, 0, pad);
+ unsafe_memcpy(t->data, ct->data, tsize - sizeof(*ct),
+ /* UAPI 0-sized destination */);
tsize += off;
t->u.user.target_size = tsize;
- strlcpy(name, target->name, sizeof(name));
+ strscpy(name, target->name, sizeof(name));
module_put(target->me);
- strncpy(t->u.user.name, name, sizeof(t->u.user.name));
+ strscpy_pad(t->u.user.name, name, sizeof(t->u.user.name));
*size += off;
*dstptr += tsize;
@@ -1202,50 +1214,65 @@ void xt_free_table_info(struct xt_table_info *info)
}
EXPORT_SYMBOL(xt_free_table_info);
+struct xt_table *xt_find_table(struct net *net, u8 af, const char *name)
+{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct xt_table *t;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_net->tables[af], list) {
+ if (strcmp(t->name, name) == 0) {
+ mutex_unlock(&xt[af].mutex);
+ return t;
+ }
+ }
+ mutex_unlock(&xt[af].mutex);
+ return NULL;
+}
+EXPORT_SYMBOL(xt_find_table);
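xt_find_table() is the lightweight lookup: unlike xt_find_table_lock() it neither takes a module reference nor instantiates a table from the larval template list. A minimal sketch of the intended use, assuming an existence check is all the caller needs:

	/* e.g. from nft compat code: does this table exist in this netns? */
	static bool xt_table_exists(struct net *net, u8 af, const char *name)
	{
		return xt_find_table(net, af, name) != NULL;
	}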
+
/* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. */
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
- struct xt_table *t, *found = NULL;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
+ struct module *owner = NULL;
+ struct xt_template *tmpl;
+ struct xt_table *t;
mutex_lock(&xt[af].mutex);
- list_for_each_entry(t, &net->xt.tables[af], list)
+ list_for_each_entry(t, &xt_net->tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
- if (net == &init_net)
- goto out;
-
- /* Table doesn't exist in this netns, re-try init */
- list_for_each_entry(t, &init_net.xt.tables[af], list) {
+ /* Table doesn't exist in this netns, check larval list */
+ list_for_each_entry(tmpl, &xt_templates[af], list) {
int err;
- if (strcmp(t->name, name))
+ if (strcmp(tmpl->name, name))
continue;
- if (!try_module_get(t->me))
+ if (!try_module_get(tmpl->me))
goto out;
+
+ owner = tmpl->me;
+
mutex_unlock(&xt[af].mutex);
- err = t->table_init(net);
+ err = tmpl->table_init(net);
if (err < 0) {
- module_put(t->me);
+ module_put(owner);
return ERR_PTR(err);
}
- found = t;
-
mutex_lock(&xt[af].mutex);
break;
}
- if (!found)
- goto out;
-
/* and once again: */
- list_for_each_entry(t, &net->xt.tables[af], list)
- if (strcmp(t->name, name) == 0)
+ list_for_each_entry(t, &xt_net->tables[af], list)
+ if (strcmp(t->name, name) == 0 && owner == t->me)
return t;
- module_put(found->me);
+ module_put(owner);
out:
mutex_unlock(&xt[af].mutex);
return ERR_PTR(-ENOENT);
@@ -1276,7 +1303,7 @@ void xt_table_unlock(struct xt_table *table)
}
EXPORT_SYMBOL_GPL(xt_table_unlock);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
void xt_compat_lock(u_int8_t af)
{
mutex_lock(&xt[af].compat_mutex);
@@ -1290,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af)
EXPORT_SYMBOL_GPL(xt_compat_unlock);
#endif
-DEFINE_PER_CPU(seqcount_t, xt_recseq);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
-
struct static_key xt_tee_enabled __read_mostly;
EXPORT_SYMBOL_GPL(xt_tee_enabled);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
@@ -1392,7 +1420,7 @@ xt_replace_table(struct xt_table *table,
table->private = newinfo;
/* make sure all cpus see new ->private value */
- smp_wmb();
+ smp_mb();
/*
* Even though table entries have now been swapped, other CPU's
@@ -1413,15 +1441,10 @@ xt_replace_table(struct xt_table *table,
}
}
-#ifdef CONFIG_AUDIT
- if (audit_enabled) {
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_NETFILTER_CFG,
- "table=%s family=%u entries=%u",
- table->name, table->af, private->number);
- }
-#endif
-
+ audit_log_nfcfg(table->name, table->af, private->number,
+ !private->number ? AUDIT_XT_OP_REGISTER :
+ AUDIT_XT_OP_REPLACE,
+ GFP_KERNEL);
return private;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
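The smp_wmb() -> smp_mb() upgrade pairs with the per-cpu seqcount check that follows the ->private publication: the replacement path must order the store to table->private against its subsequent reads of xt_recseq. A simplified sketch of that wait loop (the real code also calls cond_resched() and waits for the sequence value to change rather than spinning on the odd bit):

	for_each_possible_cpu(cpu) {
		seqcount_t *s = &per_cpu(xt_recseq, cpu);

		/* odd sequence: that CPU is still inside a table traversal
		 * bracketed by xt_write_recseq_begin()/_end() */
		while (raw_read_seqcount(s) & 1)
			cpu_relax();
	}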
@@ -1431,9 +1454,10 @@ struct xt_table *xt_register_table(struct net *net,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
- int ret;
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
struct xt_table_info *private;
struct xt_table *t, *table;
+ int ret;
/* Don't add one object to multiple lists. */
table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
@@ -1444,7 +1468,7 @@ struct xt_table *xt_register_table(struct net *net,
mutex_lock(&xt[table->af].mutex);
/* Don't autoload: we'd eat our tail... */
- list_for_each_entry(t, &net->xt.tables[table->af], list) {
+ list_for_each_entry(t, &xt_net->tables[table->af], list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
goto unlock;
@@ -1463,7 +1487,7 @@ struct xt_table *xt_register_table(struct net *net,
/* save number of initial entries */
private->initial_entries = private->number;
- list_add(&table->list, &net->xt.tables[table->af]);
+ list_add(&table->list, &xt_net->tables[table->af]);
mutex_unlock(&xt[table->af].mutex);
return table;
@@ -1483,33 +1507,43 @@ void *xt_unregister_table(struct xt_table *table)
private = table->private;
list_del(&table->list);
mutex_unlock(&xt[table->af].mutex);
+ audit_log_nfcfg(table->name, table->af, private->number,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
+ kfree(table->ops);
kfree(table);
return private;
}
EXPORT_SYMBOL_GPL(xt_unregister_table);
+#endif
#ifdef CONFIG_PROC_FS
static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
+
+ xt_net = net_generic(net, xt_pernet_id);
mutex_lock(&xt[af].mutex);
- return seq_list_start(&net->xt.tables[af], *pos);
+ return seq_list_start(&xt_net->tables[af], *pos);
}
static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ u8 af = (unsigned long)pde_data(file_inode(seq->file));
struct net *net = seq_file_net(seq);
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ struct xt_pernet *xt_net;
+
+ xt_net = net_generic(net, xt_pernet_id);
- return seq_list_next(v, &net->xt.tables[af], pos);
+ return seq_list_next(v, &xt_net->tables[af], pos);
}
static void xt_table_seq_stop(struct seq_file *seq, void *v)
{
- u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file));
+ u_int8_t af = (unsigned long)pde_data(file_inode(seq->file));
mutex_unlock(&xt[af].mutex);
}
@@ -1553,9 +1587,12 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
[MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
[MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
};
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
+ if (ppos != NULL)
+ ++(*ppos);
+
switch (trav->class) {
case MTTG_TRAV_INIT:
trav->class = MTTG_TRAV_NFP_UNSPEC;
@@ -1577,13 +1614,10 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
trav->curr = trav->curr->next;
if (trav->curr != trav->head)
break;
- /* fall through */
+ fallthrough;
default:
return NULL;
}
-
- if (ppos != NULL)
- ++*ppos;
return trav;
}
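The motivation for moving the ++(*ppos) to the top of xt_mttg_seq_next() is the general seq_file contract: ->next() must advance *pos unconditionally, even on the NULL (end-of-sequence) return, or the core warns about a buggy .next function. A minimal sketch of the rule, with next_item() standing in as a hypothetical helper:

	static void *foo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	{
		++*pos;			/* always bump, before any early return */
		return next_item(v);	/* may be NULL at end of sequence */
	}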
@@ -1602,7 +1636,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
{
- uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
+ uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
switch (trav->class) {
@@ -1727,6 +1761,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
}
EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
+int xt_register_template(const struct xt_table *table,
+ int (*table_init)(struct net *net))
+{
+ int ret = -EEXIST, af = table->af;
+ struct xt_template *t;
+
+ mutex_lock(&xt[af].mutex);
+
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0))
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ goto out_unlock;
+
+ BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name));
+
+ strscpy(t->name, table->name, sizeof(t->name));
+ t->table_init = table_init;
+ t->me = table->me;
+ list_add(&t->list, &xt_templates[af]);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&xt[af].mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xt_register_template);
+
+void xt_unregister_template(const struct xt_table *table)
+{
+ struct xt_template *t;
+ int af = table->af;
+
+ mutex_lock(&xt[af].mutex);
+ list_for_each_entry(t, &xt_templates[af], list) {
+ if (strcmp(table->name, t->name))
+ continue;
+
+ list_del(&t->list);
+ mutex_unlock(&xt[af].mutex);
+ kfree(t);
+ return;
+ }
+
+ mutex_unlock(&xt[af].mutex);
+ WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL_GPL(xt_unregister_template);
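A table module is expected to register its template once at module init, so that xt_find_table_lock() can instantiate the table on demand in namespaces where it does not exist yet. A hedged sketch for a hypothetical iptable_foo module (names invented for illustration):

	static int __init iptable_foo_init(void)
	{
		int ret = xt_register_template(&packet_foo,
					       iptable_foo_table_init);
		if (ret < 0)
			return ret;
		/* ... allocate hook ops, register pernet subsys ... */
		return 0;
	}

	static void __exit iptable_foo_exit(void)
	{
		xt_unregister_template(&packet_foo);
		/* ... teardown ... */
	}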
+
int xt_proto_init(struct net *net, u_int8_t af)
{
#ifdef CONFIG_PROC_FS
@@ -1744,7 +1830,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
root_uid = make_kuid(net->user_ns, 0);
root_gid = make_kgid(net->user_ns, 0);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops,
sizeof(struct seq_net_private),
@@ -1754,7 +1840,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_match_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1764,7 +1850,7 @@ int xt_proto_init(struct net *net, u_int8_t af)
if (uid_valid(root_uid) && gid_valid(root_gid))
proc_set_user(proc, root_uid, root_gid);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
proc = proc_create_seq_private(buf, 0440, net->proc_net,
&xt_target_seq_ops, sizeof(struct nf_mttg_trav),
@@ -1779,12 +1865,12 @@ int xt_proto_init(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
out_remove_matches:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out_remove_tables:
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
out:
@@ -1798,21 +1884,22 @@ void xt_proto_fini(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
char buf[XT_FUNCTION_MAXNAMELEN];
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
- strlcpy(buf, xt_prefix[af], sizeof(buf));
+ strscpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
remove_proc_entry(buf, net->proc_net);
#endif /*CONFIG_PROC_FS*/
}
EXPORT_SYMBOL_GPL(xt_proto_fini);
+#ifdef CONFIG_NETFILTER_XTABLES_LEGACY
/**
* xt_percpu_counter_alloc - allocate x_tables rule counter
*
@@ -1867,27 +1954,32 @@ void xt_percpu_counter_free(struct xt_counters *counters)
free_percpu((void __percpu *)pcnt);
}
EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+#endif
static int __net_init xt_net_init(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- INIT_LIST_HEAD(&net->xt.tables[i]);
+ INIT_LIST_HEAD(&xt_net->tables[i]);
return 0;
}
static void __net_exit xt_net_exit(struct net *net)
{
+ struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);
int i;
for (i = 0; i < NFPROTO_NUMPROTO; i++)
- WARN_ON_ONCE(!list_empty(&net->xt.tables[i]));
+ WARN_ON_ONCE(!list_empty(&xt_net->tables[i]));
}
static struct pernet_operations xt_net_ops = {
.init = xt_net_init,
.exit = xt_net_exit,
+ .id = &xt_pernet_id,
+ .size = sizeof(struct xt_pernet),
};
static int __init xt_init(void)
@@ -1895,22 +1987,25 @@ static int __init xt_init(void)
unsigned int i;
int rv;
- for_each_possible_cpu(i) {
- seqcount_init(&per_cpu(xt_recseq, i));
+ if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) {
+ for_each_possible_cpu(i) {
+ seqcount_init(&per_cpu(xt_recseq, i));
+ }
}
- xt = kmalloc_array(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL);
+ xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL);
if (!xt)
return -ENOMEM;
for (i = 0; i < NFPROTO_NUMPROTO; i++) {
mutex_init(&xt[i].mutex);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
mutex_init(&xt[i].compat_mutex);
xt[i].compat_tab = NULL;
#endif
INIT_LIST_HEAD(&xt[i].target);
INIT_LIST_HEAD(&xt[i].match);
+ INIT_LIST_HEAD(&xt_templates[i]);
}
rv = register_pernet_subsys(&xt_net_ops);
if (rv < 0)
@@ -1926,4 +2021,3 @@ static void __exit xt_fini(void)
module_init(xt_init);
module_exit(xt_fini);
-
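Taken together, the x_tables changes above move the per-namespace table lists from the embedded net->xt area to dynamically sized pernet storage. Access now goes through net_generic() with the registered id; a minimal sketch, assuming the xt_pernet_id and struct xt_pernet from this file:

	static struct list_head *xt_tables_of(struct net *net, u_int8_t af)
	{
		struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);

		return &xt_net->tables[af];
	}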
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index af883f1b64f9..b6a015aee0ce 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Creates audit record for dropped/accepted packets
*
* (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
* (C) 2010-2011 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -120,7 +117,7 @@ static int audit_tg_check(const struct xt_tgchk_param *par)
const struct xt_audit_info *info = par->targinfo;
if (info->type > XT_AUDIT_TYPE_MAX) {
- pr_info_ratelimited("Audit type out of range (valid range: 0..%hhu)\n",
+ pr_info_ratelimited("Audit type out of range (valid range: 0..%u)\n",
XT_AUDIT_TYPE_MAX);
return -ERANGE;
}
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index 6c7aa6a0a0d2..9d99f5a3d176 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* iptables module for the packet checksum mangling
*
* (C) 2002 by Harald Welte <laforge@netfilter.org>
* (C) 2010 Red Hat, Inc.
*
* Author: Michael S. Tsirkin <mst@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -66,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par)
return 0;
}
-static struct xt_target checksum_tg_reg __read_mostly = {
- .name = "CHECKSUM",
- .family = NFPROTO_UNSPEC,
- .target = checksum_tg,
- .targetsize = sizeof(struct xt_CHECKSUM_info),
- .table = "mangle",
- .checkentry = checksum_tg_check,
- .me = THIS_MODULE,
+static struct xt_target checksum_tg_reg[] __read_mostly = {
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV4,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CHECKSUM",
+ .family = NFPROTO_IPV6,
+ .target = checksum_tg,
+ .targetsize = sizeof(struct xt_CHECKSUM_info),
+ .table = "mangle",
+ .checkentry = checksum_tg_check,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init checksum_tg_init(void)
{
- return xt_register_target(&checksum_tg_reg);
+ return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
static void __exit checksum_tg_exit(void)
{
- xt_unregister_target(&checksum_tg_reg);
+ xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg));
}
module_init(checksum_tg_init);
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index af9c4dadf816..0ae8d8a1216e 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for setting the skb->priority field
* of an skb for qdisc classification.
*/
/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -41,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
{
.name = "CLASSIFY",
.revision = 0,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
- (1 << NF_INET_POST_ROUTING),
+ (1 << NF_INET_POST_ROUTING),
.target = classify_tg,
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
@@ -57,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_classify_target_info),
.me = THIS_MODULE,
},
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CLASSIFY",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_POST_ROUTING),
+ .target = classify_tg,
+ .targetsize = sizeof(struct xt_classify_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init classify_tg_init(void)
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index f3f1caac949b..1494b3ee30e1 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This module is used to copy security markings from packets
* to connections, and restore security markings from connections
@@ -5,15 +6,10 @@
* with the SECMARK target and state match.
*
* Based somewhat on CONNMARK:
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
*
* (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -118,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_target connsecmark_tg_reg __read_mostly = {
- .name = "CONNSECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = connsecmark_tg_check,
- .destroy = connsecmark_tg_destroy,
- .target = connsecmark_tg,
- .targetsize = sizeof(struct xt_connsecmark_target_info),
- .me = THIS_MODULE,
+static struct xt_target connsecmark_tg_reg[] __read_mostly = {
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNSECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = connsecmark_tg_check,
+ .destroy = connsecmark_tg_destroy,
+ .target = connsecmark_tg,
+ .targetsize = sizeof(struct xt_connsecmark_target_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connsecmark_tg_init(void)
{
- return xt_register_target(&connsecmark_tg_reg);
+ return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
static void __exit connsecmark_tg_exit(void)
{
- xt_unregister_target(&connsecmark_tg_reg);
+ xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg));
}
module_init(connsecmark_tg_init);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 2c7a4b80206f..3ba94c34297c 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -27,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
return XT_CONTINUE;
if (ct) {
- atomic_inc(&ct->ct_general.use);
+ refcount_inc(&ct->ct_general.use);
nf_ct_set(skb, ct, IP_CT_NEW);
} else {
nf_ct_set(skb, ct, IP_CT_UNTRACKED);
@@ -99,89 +96,28 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOMEM;
}
- help->helper = helper;
+ rcu_assign_pointer(help->helper, helper);
return 0;
}
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout)
-{
- typeof(nf_ct_timeout_put_hook) timeout_put;
-
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
- if (timeout_put)
- timeout_put(timeout);
-}
-#endif
-
static int
xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
const char *timeout_name)
{
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- typeof(nf_ct_timeout_find_get_hook) timeout_find_get;
const struct nf_conntrack_l4proto *l4proto;
- struct nf_ct_timeout *timeout;
- struct nf_conn_timeout *timeout_ext;
- const char *errmsg = NULL;
- int ret = 0;
u8 proto;
- rcu_read_lock();
- timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook);
- if (timeout_find_get == NULL) {
- ret = -ENOENT;
- errmsg = "Timeout policy base is empty";
- goto out;
- }
-
proto = xt_ct_find_proto(par);
if (!proto) {
- ret = -EINVAL;
- errmsg = "You must specify a L4 protocol and not use inversions on it";
- goto out;
- }
-
- timeout = timeout_find_get(par->net, timeout_name);
- if (timeout == NULL) {
- ret = -ENOENT;
- pr_info_ratelimited("No such timeout policy \"%s\"\n",
- timeout_name);
- goto out;
- }
-
- if (timeout->l3num != par->family) {
- ret = -EINVAL;
- pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n",
- timeout_name, 3, timeout->l3num);
- goto err_put_timeout;
- }
- /* Make sure the timeout policy matches any existing protocol tracker,
- * otherwise default to generic.
- */
- l4proto = __nf_ct_l4proto_find(proto);
- if (timeout->l4proto->l4proto != l4proto->l4proto) {
- ret = -EINVAL;
- pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n",
- timeout_name, 4, timeout->l4proto->l4proto);
- goto err_put_timeout;
- }
- timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
- if (!timeout_ext) {
- ret = -ENOMEM;
- goto err_put_timeout;
+ pr_info_ratelimited("You must specify a L4 protocol and not "
+ "use inversions on it");
+ return -EINVAL;
}
+ l4proto = nf_ct_l4proto_find(proto);
+ return nf_ct_set_timeout(par->net, ct, par->family, l4proto->l4proto,
+ timeout_name);
- rcu_read_unlock();
- return ret;
-
-err_put_timeout:
- __xt_ct_tg_timeout_put(timeout);
-out:
- rcu_read_unlock();
- if (errmsg)
- pr_info_ratelimited("%s\n", errmsg);
- return ret;
#else
return -EOPNOTSUPP;
#endif
@@ -200,6 +136,21 @@ static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info)
}
}
+static void xt_ct_put_helper(struct nf_conn_help *help)
+{
+ struct nf_conntrack_helper *helper;
+
+ if (!help)
+ return;
+
+ /* not yet exposed to other cpus, or ruleset
+ * already detached (post-replacement).
+ */
+ helper = rcu_dereference_raw(help->helper);
+ if (helper)
+ nf_conntrack_helper_put(helper);
+}
+
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
@@ -236,7 +187,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err2;
}
- ret = 0;
if ((info->ct_events || info->exp_events) &&
!nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events,
GFP_KERNEL)) {
@@ -266,15 +216,13 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err4;
}
__set_bit(IPS_CONFIRMED_BIT, &ct->status);
- nf_conntrack_get(&ct->ct_general);
out:
info->ct = ct;
return 0;
err4:
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -328,26 +276,6 @@ static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
return xt_ct_tg_check(par, par->targinfo);
}
-static void xt_ct_destroy_timeout(struct nf_conn *ct)
-{
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
- struct nf_conn_timeout *timeout_ext;
- typeof(nf_ct_timeout_put_hook) timeout_put;
-
- rcu_read_lock();
- timeout_put = rcu_dereference(nf_ct_timeout_put_hook);
-
- if (timeout_put) {
- timeout_ext = nf_ct_timeout_find(ct);
- if (timeout_ext) {
- timeout_put(timeout_ext->timeout);
- RCU_INIT_POINTER(timeout_ext->timeout, NULL);
- }
- }
- rcu_read_unlock();
-#endif
-}
-
static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
struct xt_ct_target_info_v1 *info)
{
@@ -356,12 +284,11 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
- if (help)
- nf_conntrack_helper_put(help->helper);
+ xt_ct_put_helper(help);
nf_ct_netns_put(par->net, par->family);
- xt_ct_destroy_timeout(ct);
+ nf_ct_destroy_timeout(ct);
nf_ct_put(info->ct);
}
}
@@ -386,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par)
xt_ct_tg_destroy(par, par->targinfo);
}
+static unsigned int
+notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ /* Previously seen (loopback)? Ignore. */
+ if (skb->_nfct != 0)
+ return XT_CONTINUE;
+
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+ return XT_CONTINUE;
+}
+
static struct xt_target xt_ct_tg_reg[] __read_mostly = {
{
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.targetsize = sizeof(struct xt_ct_target_info),
.usersize = offsetof(struct xt_ct_target_info, ct),
.checkentry = xt_ct_tg_check_v0,
@@ -400,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -412,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
},
{
.name = "CT",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 2,
.targetsize = sizeof(struct xt_ct_target_info_v1),
.usersize = offsetof(struct xt_ct_target_info, ct),
@@ -422,60 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = {
.table = "raw",
.me = THIS_MODULE,
},
-};
-
-static unsigned int
-notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- /* Previously seen (loopback)? Ignore. */
- if (skb->_nfct != 0)
- return XT_CONTINUE;
-
- nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-
- return XT_CONTINUE;
-}
-
-static int notrack_chk(const struct xt_tgchk_param *par)
-{
- if (!par->net->xt.notrack_deprecated_warning) {
- pr_info("netfilter: NOTRACK target is deprecated, "
- "use CT instead or upgrade iptables\n");
- par->net->xt.notrack_deprecated_warning = true;
- }
- return 0;
-}
-
-static struct xt_target notrack_tg_reg __read_mostly = {
- .name = "NOTRACK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = notrack_chk,
- .target = notrack_tg,
- .table = "raw",
- .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NOTRACK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = notrack_tg,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .targetsize = sizeof(struct xt_ct_target_info),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v0,
+ .destroy = xt_ct_tg_destroy_v0,
+ .target = xt_ct_target_v0,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v1,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CT",
+ .family = NFPROTO_IPV6,
+ .revision = 2,
+ .targetsize = sizeof(struct xt_ct_target_info_v1),
+ .usersize = offsetof(struct xt_ct_target_info, ct),
+ .checkentry = xt_ct_tg_check_v2,
+ .destroy = xt_ct_tg_destroy_v1,
+ .target = xt_ct_target_v1,
+ .table = "raw",
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_ct_tg_init(void)
{
- int ret;
-
- ret = xt_register_target(&notrack_tg_reg);
- if (ret < 0)
- return ret;
-
- ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- if (ret < 0) {
- xt_unregister_target(&notrack_tg_reg);
- return ret;
- }
- return 0;
+ return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
}
static void __exit xt_ct_tg_exit(void)
{
xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg));
- xt_unregister_target(&notrack_tg_reg);
}
module_init(xt_ct_tg_init);
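With the helper pointer now RCU-managed, the xt_CT changes follow the usual publish/read split: rcu_assign_pointer() on assignment, rcu_dereference() in packet context, and rcu_dereference_raw() only where the template conntrack is provably not (or no longer) visible to other CPUs, as the comment in xt_ct_put_helper() notes. A reader-side sketch:

	/* packet path, under rcu_read_lock() */
	static struct nf_conntrack_helper *
	ct_helper(const struct nf_conn_help *help)
	{
		return rcu_dereference(help->helper);
	}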
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 098ed851b7a7..cfa44515ab72 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8
*
* (C) 2002 by Harald Welte <laforge@netfilter.org>
* based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* See RFC2474 for a description of the DSCP field within the IP Header.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -27,6 +24,8 @@ MODULE_ALIAS("ip6t_DSCP");
MODULE_ALIAS("ipt_TOS");
MODULE_ALIAS("ip6t_TOS");
+#define XT_DSCP_ECN_MASK 3u
+
static unsigned int
dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -34,11 +33,10 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
- ipv4_change_dsfield(ip_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv4_change_dsfield(ip_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
@@ -52,11 +50,10 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
if (dscp != dinfo->dscp) {
- if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
- ipv6_change_dsfield(ipv6_hdr(skb),
- (__force __u8)(~XT_DSCP_MASK),
+ ipv6_change_dsfield(ipv6_hdr(skb), XT_DSCP_ECN_MASK,
dinfo->dscp << XT_DSCP_SHIFT);
}
return XT_CONTINUE;
@@ -82,7 +79,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ip_hdr(skb);
ipv4_change_dsfield(iph, 0, nv);
@@ -102,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par)
nv = (orig & ~info->tos_mask) ^ info->tos_value;
if (orig != nv) {
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return NF_DROP;
iph = ipv6_hdr(skb);
ipv6_change_dsfield(iph, 0, nv);
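The new XT_DSCP_ECN_MASK value encodes the same bits as the old (~XT_DSCP_MASK) expression, just without the __force cast gymnastics: the ds field is six DSCP bits over two ECN bits, and ipv4/ipv6_change_dsfield(hdr, mask, value) preserves exactly the bits set in mask. A compile-time check one could add to convince oneself (XT_DSCP_MASK is 0xfc, so the complement's low byte is 0x03):

	BUILD_BUG_ON(XT_DSCP_ECN_MASK != (u8)~XT_DSCP_MASK);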
diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c
index 4653b071bed4..7873b834c300 100644
--- a/net/netfilter/xt_HL.c
+++ b/net/netfilter/xt_HL.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TTL modification target for IP tables
* (C) 2000,2005 by Harald Welte <laforge@netfilter.org>
*
* Hop Limit modification target for ip6tables
* Maciej Soltysiak <solt@dns.toxicfilms.tv>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -32,7 +29,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par)
const struct ipt_TTL_info *info = par->targinfo;
int new_ttl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*iph)))
return NF_DROP;
iph = ip_hdr(skb);
@@ -72,7 +69,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par)
const struct ip6t_HL_info *info = par->targinfo;
int new_hl;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, sizeof(*ip6h)))
return NF_DROP;
ip6h = ipv6_hdr(skb);
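Note the inverted polarity in the xt_HL conversion: skb_make_writable() returned true on success, while skb_ensure_writable() returns 0 on success and a negative errno on failure, hence the bare "if (skb_ensure_writable(...)) return NF_DROP;". The canonical call pattern, sketched:

	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		return NF_DROP;		/* could not unshare/pull header */
	iph = ip_hdr(skb);		/* re-load: header may have moved */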
diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c
index 9c75f419cd80..8928ec56c388 100644
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_HMARK - Netfilter module to set mark by means of hashing
*
* (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com>
* (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -242,11 +239,7 @@ static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff)
return 0;
/* Error message? */
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_SOURCE_QUENCH &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB &&
- icmph->type != ICMP_REDIRECT)
+ if (!icmp_is_err(icmph->type))
return 0;
*nhoff += iphsz + sizeof(_ih);
@@ -283,7 +276,7 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t,
return 0;
/* follow-up fragments don't contain ports, skip all fragments */
- if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ if (ip_is_fragment(ip))
return 0;
hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index eb4cbd244c3d..d73957592c9d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/net/netfilter/xt_IDLETIMER.c
*
@@ -11,26 +12,13 @@
* by Luciano Coelho <luciano.coelho@nokia.com>
*
* Contact: Luciano Coelho <luciano.coelho@nokia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/timer.h>
+#include <linux/alarmtimer.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/netfilter.h>
@@ -41,21 +29,17 @@
#include <linux/workqueue.h>
#include <linux/sysfs.h>
-struct idletimer_tg_attr {
- struct attribute attr;
- ssize_t (*show)(struct kobject *kobj,
- struct attribute *attr, char *buf);
-};
-
struct idletimer_tg {
struct list_head entry;
+ struct alarm alarm;
struct timer_list timer;
struct work_struct work;
struct kobject *kobj;
- struct idletimer_tg_attr attr;
+ struct device_attribute attr;
unsigned int refcnt;
+ u8 timer_type;
};
static LIST_HEAD(idletimer_tg_list);
@@ -76,25 +60,34 @@ struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
return NULL;
}
-static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
- char *buf)
+static ssize_t idletimer_tg_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ struct timespec64 ktimespec = {};
+ long time_diff = 0;
mutex_lock(&list_mutex);
- timer = __idletimer_tg_find_by_label(attr->name);
- if (timer)
- expires = timer->timer.expires;
+ timer = __idletimer_tg_find_by_label(attr->attr.name);
+ if (timer) {
+ if (timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm);
+ ktimespec = ktime_to_timespec64(expires_alarm);
+ time_diff = ktimespec.tv_sec;
+ } else {
+ expires = timer->timer.expires;
+ time_diff = jiffies_to_msecs(expires - jiffies) / 1000;
+ }
+ }
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
- return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
+ return sysfs_emit(buf, "%ld\n", time_diff);
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -107,13 +100,21 @@ static void idletimer_tg_work(struct work_struct *work)
static void idletimer_tg_expired(struct timer_list *t)
{
- struct idletimer_tg *timer = from_timer(timer, t, timer);
+ struct idletimer_tg *timer = timer_container_of(timer, t, timer);
pr_debug("timer %s expired\n", timer->attr.attr.name);
schedule_work(&timer->work);
}
+static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now)
+{
+ struct idletimer_tg *timer = alarm->data;
+
+ pr_debug("alarm %s expired\n", timer->attr.attr.name);
+ schedule_work(&timer->work);
+}
+
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
{
int ret;
@@ -134,7 +135,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
{
int ret;
- info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL);
if (!info->timer) {
ret = -ENOMEM;
goto out;
@@ -167,7 +168,69 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
INIT_WORK(&info->timer->work, idletimer_tg_work);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
+
+ return 0;
+
+out_free_attr:
+ kfree(info->timer->attr.attr.name);
+out_free_timer:
+ kfree(info->timer);
+out:
+ return ret;
+}
+
+static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
+{
+ int ret;
+
+ info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+ if (!info->timer) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+ if (ret < 0)
+ goto out_free_timer;
+
+ sysfs_attr_init(&info->timer->attr.attr);
+ info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+ if (!info->timer->attr.attr.name) {
+ ret = -ENOMEM;
+ goto out_free_timer;
+ }
+ info->timer->attr.attr.mode = 0444;
+ info->timer->attr.show = idletimer_tg_show;
+
+ ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ if (ret < 0) {
+ pr_debug("couldn't add file to sysfs");
+ goto out_free_attr;
+ }
+
+ /* notify userspace */
+	kobject_uevent(idletimer_tg_kobj, KOBJ_ADD);
+
+ list_add(&info->timer->entry, &idletimer_tg_list);
+ pr_debug("timer type value is %u", info->timer_type);
+ info->timer->timer_type = info->timer_type;
+ info->timer->refcnt = 1;
+
+ INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout;
+ alarm_init(&info->timer->alarm, ALARM_BOOTTIME,
+ idletimer_tg_alarmproc);
+ info->timer->alarm.data = info->timer;
+ tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
+ mod_timer(&info->timer->timer,
+ secs_to_jiffies(info->timeout) + jiffies);
+ }
return 0;
@@ -191,18 +254,35 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
info->label, info->timeout);
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
return XT_CONTINUE;
}
-static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
{
- struct idletimer_tg_info *info = par->targinfo;
- int ret;
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ mod_timer(&info->timer->timer,
+ secs_to_jiffies(info->timeout) + jiffies);
+ }
+
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_helper(struct idletimer_tg_info *info)
+{
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
return -EINVAL;
@@ -217,14 +297,30 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
pr_debug("label is empty or not nul-terminated\n");
return -EINVAL;
}
+ return 0;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ ret = idletimer_tg_helper(info);
+	if (ret < 0) {
+		pr_debug("checkentry helper returned an error\n");
+ return -EINVAL;
+ }
mutex_lock(&list_mutex);
info->timer = __idletimer_tg_find_by_label(info->label);
if (info->timer) {
info->timer->refcnt++;
mod_timer(&info->timer->timer,
- msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ secs_to_jiffies(info->timeout) + jiffies);
pr_debug("increased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
@@ -241,6 +337,68 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
return 0;
}
+static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info_v1 *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ if (info->send_nl_msg)
+ return -EOPNOTSUPP;
+
+ ret = idletimer_tg_helper((struct idletimer_tg_info *)info);
+	if (ret < 0) {
+		pr_debug("checkentry helper returned an error\n");
+ return -EINVAL;
+ }
+
+ if (info->timer_type > XT_IDLETIMER_ALARM) {
+ pr_debug("invalid value for timer type\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&list_mutex);
+
+ info->timer = __idletimer_tg_find_by_label(info->label);
+ if (info->timer) {
+ if (info->timer->timer_type != info->timer_type) {
+ pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n");
+ mutex_unlock(&list_mutex);
+ return -EINVAL;
+ }
+
+ info->timer->refcnt++;
+ if (info->timer_type & XT_IDLETIMER_ALARM) {
+ /* calculate remaining expiry time */
+ ktime_t tout = alarm_expires_remaining(&info->timer->alarm);
+ struct timespec64 ktimespec = ktime_to_timespec64(tout);
+
+ if (ktimespec.tv_sec > 0) {
+ pr_debug("time_expiry_remaining %lld\n",
+ ktimespec.tv_sec);
+ alarm_start_relative(&info->timer->alarm, tout);
+ }
+ } else {
+ mod_timer(&info->timer->timer,
+ secs_to_jiffies(info->timeout) + jiffies);
+ }
+ pr_debug("increased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ } else {
+ ret = idletimer_tg_create_v1(info);
+ if (ret < 0) {
+ pr_debug("failed to create timer\n");
+ mutex_unlock(&list_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&list_mutex);
+ return 0;
+}
+
static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
@@ -249,32 +407,102 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_lock(&list_mutex);
- if (--info->timer->refcnt == 0) {
- pr_debug("deleting timer %s\n", info->label);
+ if (--info->timer->refcnt > 0) {
+ pr_debug("decreased refcnt of timer %s to %u\n",
+ info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
+ }
- list_del(&info->timer->entry);
- del_timer_sync(&info->timer->timer);
- cancel_work_sync(&info->timer->work);
- sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
- kfree(info->timer->attr.attr.name);
- kfree(info->timer);
- } else {
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
+ mutex_unlock(&list_mutex);
+
+ timer_shutdown_sync(&info->timer->timer);
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
+}
+
+static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
+
+ pr_debug("destroy targinfo %s\n", info->label);
+
+ mutex_lock(&list_mutex);
+
+ if (--info->timer->refcnt > 0) {
pr_debug("decreased refcnt of timer %s to %u\n",
info->label, info->timer->refcnt);
+ mutex_unlock(&list_mutex);
+ return;
}
+ pr_debug("deleting timer %s\n", info->label);
+
+ list_del(&info->timer->entry);
mutex_unlock(&list_mutex);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ alarm_cancel(&info->timer->alarm);
+ } else {
+ timer_shutdown_sync(&info->timer->timer);
+ }
+ cancel_work_sync(&info->timer->work);
+ sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+ kfree(info->timer->attr.attr.name);
+ kfree(info->timer);
}
-static struct xt_target idletimer_tg __read_mostly = {
- .name = "IDLETIMER",
- .family = NFPROTO_UNSPEC,
- .target = idletimer_tg_target,
- .targetsize = sizeof(struct idletimer_tg_info),
- .usersize = offsetof(struct idletimer_tg_info, timer),
- .checkentry = idletimer_tg_checkentry,
- .destroy = idletimer_tg_destroy,
- .me = THIS_MODULE,
+
+static struct xt_target idletimer_tg[] __read_mostly = {
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV4,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .target = idletimer_tg_target,
+ .targetsize = sizeof(struct idletimer_tg_info),
+ .usersize = offsetof(struct idletimer_tg_info, timer),
+ .checkentry = idletimer_tg_checkentry,
+ .destroy = idletimer_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct class *idletimer_tg_class;
@@ -285,7 +513,7 @@ static int __init idletimer_tg_init(void)
{
int err;
- idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer");
+ idletimer_tg_class = class_create("xt_idletimer");
err = PTR_ERR(idletimer_tg_class);
if (IS_ERR(idletimer_tg_class)) {
pr_debug("couldn't register device class\n");
@@ -302,7 +530,8 @@ static int __init idletimer_tg_init(void)
idletimer_tg_kobj = &idletimer_tg_device->kobj;
- err = xt_register_target(&idletimer_tg);
+ err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
+
if (err < 0) {
pr_debug("couldn't register xt target\n");
goto out_dev;
@@ -319,7 +548,7 @@ out:
static void __exit idletimer_tg_exit(void)
{
- xt_unregister_target(&idletimer_tg);
+ xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
device_destroy(idletimer_tg_class, MKDEV(0, 0));
class_destroy(idletimer_tg_class);
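The v1 IDLETIMER revision can arm either a classic timer_list (which stops ticking across suspend) or an alarmtimer on ALARM_BOOTTIME (which fires even if the system suspends in between). A minimal sketch of the arm/re-arm pattern used above, assuming the struct idletimer_tg from this file:

	static void idletimer_arm_alarm(struct idletimer_tg *timer, u32 timeout)
	{
		ktime_t tout = ktime_set(timeout, 0);	/* seconds, 0 nsec */

		alarm_start_relative(&timer->alarm, tout);
	}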
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index 19846445504d..90dcf088071a 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -1,22 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_LED.c - netfilter target to make LEDs blink upon packet matches
*
* Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301 USA.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -57,7 +43,6 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_led_info *ledinfo = par->targinfo;
struct xt_led_info_internal *ledinternal = ledinfo->internal_data;
- unsigned long led_delay = XT_LED_BLINK_DELAY;
/*
* If "always blink" is enabled, and there's still some time until the
@@ -66,7 +51,7 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
if ((ledinfo->delay > 0) && ledinfo->always_blink &&
timer_pending(&ledinternal->timer))
led_trigger_blink_oneshot(&ledinternal->netfilter_led_trigger,
- &led_delay, &led_delay, 1);
+ XT_LED_BLINK_DELAY, XT_LED_BLINK_DELAY, 1);
else
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL);
@@ -87,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
static void led_timeout_callback(struct timer_list *t)
{
- struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t,
- timer);
+ struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal,
+ t,
+ timer);
led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
}
@@ -111,7 +97,9 @@ static int led_tg_check(const struct xt_tgchk_param *par)
struct xt_led_info_internal *ledinternal;
int err;
- if (ledinfo->id[0] == '\0')
+ /* Bail out if empty string or not a string at all. */
+ if (ledinfo->id[0] == '\0' ||
+ !memchr(ledinfo->id, '\0', sizeof(ledinfo->id)))
return -EINVAL;
mutex_lock(&xt_led_mutex);
@@ -180,7 +168,7 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par)
list_del(&ledinternal->list);
- del_timer_sync(&ledinternal->timer);
+ timer_shutdown_sync(&ledinternal->timer);
led_trigger_unregister(&ledinternal->netfilter_led_trigger);
@@ -190,26 +178,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par)
kfree(ledinternal);
}
-static struct xt_target led_tg_reg __read_mostly = {
- .name = "LED",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = led_tg,
- .targetsize = sizeof(struct xt_led_info),
- .usersize = offsetof(struct xt_led_info, internal_data),
- .checkentry = led_tg_check,
- .destroy = led_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target led_tg_reg[] __read_mostly = {
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "LED",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = led_tg,
+ .targetsize = sizeof(struct xt_led_info),
+ .usersize = offsetof(struct xt_led_info, internal_data),
+ .checkentry = led_tg_check,
+ .destroy = led_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init led_tg_init(void)
{
- return xt_register_target(&led_tg_reg);
+ return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
static void __exit led_tg_exit(void)
{
- xt_unregister_target(&led_tg_reg);
+ xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg));
}
module_init(led_tg_init);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index c3b2017ebe41..f39244f9c0ed 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for logging packets.
*/
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -47,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int log_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
+ int ret;
if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6)
return -EINVAL;
@@ -61,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
- return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nf_log_syslog");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+ }
+
+ return ret;
}
static void log_tg_destroy(const struct xt_tgdtor_param *par)
@@ -111,3 +116,4 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging");
MODULE_ALIAS("ipt_LOG");
MODULE_ALIAS("ip6t_LOG");
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_MASQUERADE.c b/net/netfilter/xt_MASQUERADE.c
new file mode 100644
index 000000000000..eae05c178336
--- /dev/null
+++ b/net/netfilter/xt_MASQUERADE.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Masquerade. Simple mapping which alters range to a local IP address
+ (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_masquerade.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
+
+/* FIXME: Multiple targets. --RR */
+static int masquerade_tg_check(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+
+ if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
+ }
+ if (mr->rangesize != 1) {
+ pr_debug("bad rangesize %u\n", mr->rangesize);
+ return -EINVAL;
+ }
+ return nf_ct_netns_get(par->net, par->family);
+}
+
+static unsigned int
+masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct nf_nat_range2 range;
+ const struct nf_nat_ipv4_multi_range_compat *mr;
+
+ mr = par->targinfo;
+ range.flags = mr->range[0].flags;
+ range.min_proto = mr->range[0].min;
+ range.max_proto = mr->range[0].max;
+
+ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+ xt_out(par));
+}
+
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_ct_netns_put(par->net, par->family);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static unsigned int
+masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
+}
+
+static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
+{
+ const struct nf_nat_range2 *range = par->targinfo;
+
+ if (range->flags & NF_NAT_RANGE_MAP_IPS)
+ return -EINVAL;
+
+ return nf_ct_netns_get(par->net, par->family);
+}
+#endif
+
+static struct xt_target masquerade_tg_reg[] __read_mostly = {
+ {
+#if IS_ENABLED(CONFIG_IPV6)
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV6,
+ .target = masquerade_tg6,
+ .targetsize = sizeof(struct nf_nat_range),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg6_checkentry,
+ .destroy = masquerade_tg_destroy,
+ .me = THIS_MODULE,
+ }, {
+#endif
+ .name = "MASQUERADE",
+ .family = NFPROTO_IPV4,
+ .target = masquerade_tg,
+ .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
+ .table = "nat",
+ .hooks = 1 << NF_INET_POST_ROUTING,
+ .checkentry = masquerade_tg_check,
+ .destroy = masquerade_tg_destroy,
+ .me = THIS_MODULE,
+ }
+};
+
+static int __init masquerade_tg_init(void)
+{
+ int ret;
+
+ ret = xt_register_targets(masquerade_tg_reg,
+ ARRAY_SIZE(masquerade_tg_reg));
+ if (ret)
+ return ret;
+
+ ret = nf_nat_masquerade_inet_register_notifiers();
+ if (ret) {
+ xt_unregister_targets(masquerade_tg_reg,
+ ARRAY_SIZE(masquerade_tg_reg));
+ return ret;
+ }
+
+ return ret;
+}
+
+static void __exit masquerade_tg_exit(void)
+{
+ xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg));
+ nf_nat_masquerade_inet_unregister_notifiers();
+}
+
+module_init(masquerade_tg_init);
+module_exit(masquerade_tg_exit);
+#if IS_ENABLED(CONFIG_IPV6)
+MODULE_ALIAS("ip6t_MASQUERADE");
+#endif
+MODULE_ALIAS("ipt_MASQUERADE");
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 1d437875e15a..cb2ee80d84fa 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/ip.h>
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 1ed0cac585c4..6dcf4bc7e30b 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -45,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
static int nflog_tg_check(const struct xt_tgchk_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
+ int ret;
if (info->flags & ~XT_NFLOG_MASK)
return -EINVAL;
if (info->prefix[sizeof(info->prefix) - 1] != '\0')
return -EINVAL;
- return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ if (ret != 0 && !par->nft_compat) {
+ request_module("%s", "nfnetlink_log");
+
+ ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+ }
+
+ return ret;
}
static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
@@ -59,26 +64,41 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
}
-static struct xt_target nflog_tg_reg __read_mostly = {
- .name = "NFLOG",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = nflog_tg_check,
- .destroy = nflog_tg_destroy,
- .target = nflog_tg,
- .targetsize = sizeof(struct xt_nflog_info),
- .me = THIS_MODULE,
+static struct xt_target nflog_tg_reg[] __read_mostly = {
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init nflog_tg_init(void)
{
- return xt_register_target(&nflog_tg_reg);
+ return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
static void __exit nflog_tg_exit(void)
{
- xt_unregister_target(&nflog_tg_reg);
+ xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg));
}
module_init(nflog_tg_init);
module_exit(nflog_tg_exit);
+MODULE_SOFTDEP("pre: nfnetlink_log");
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index a9aca80a32ae..466da23e36ff 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* iptables module for using the new netfilter netlink queue
*
* (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 9e05c86ba5c4..4f49cfc27831 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -33,7 +30,7 @@ static unsigned int jhash_rnd __read_mostly;
static unsigned int xt_rateest_hash(const char *name)
{
- return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) &
+ return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) &
(RATEEST_HSIZE - 1);
}
@@ -97,11 +94,11 @@ static unsigned int
xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_rateest_target_info *info = par->targinfo;
- struct gnet_stats_basic_packed *stats = &info->est->bstats;
+ struct gnet_stats_basic_sync *stats = &info->est->bstats;
spin_lock_bh(&info->est->lock);
- stats->bytes += skb->len;
- stats->packets++;
+ u64_stats_add(&stats->bytes, skb->len);
+ u64_stats_inc(&stats->packets);
spin_unlock_bh(&info->est->lock);
return XT_CONTINUE;
@@ -118,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
} cfg;
int ret;
+ if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name))
+ return -ENAMETOOLONG;
+
net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
mutex_lock(&xn->hash_lock);
@@ -143,7 +143,8 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
if (!est)
goto err1;
- strlcpy(est->name, info->name, sizeof(est->name));
+ gnet_stats_basic_sync_init(&est->bstats);
+ strscpy(est->name, info->name, sizeof(est->name));
spin_lock_init(&est->lock);
est->refcnt = 1;
est->params.interval = info->interval;
@@ -178,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
xt_rateest_put(par->net, info->est);
}
-static struct xt_target xt_rateest_tg_reg __read_mostly = {
- .name = "RATEEST",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .target = xt_rateest_tg,
- .checkentry = xt_rateest_tg_checkentry,
- .destroy = xt_rateest_tg_destroy,
- .targetsize = sizeof(struct xt_rateest_target_info),
- .usersize = offsetof(struct xt_rateest_target_info, est),
- .me = THIS_MODULE,
+static struct xt_target xt_rateest_tg_reg[] __read_mostly = {
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "RATEEST",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .target = xt_rateest_tg,
+ .checkentry = xt_rateest_tg_checkentry,
+ .destroy = xt_rateest_tg_destroy,
+ .targetsize = sizeof(struct xt_rateest_target_info),
+ .usersize = offsetof(struct xt_rateest_target_info, est),
+ .me = THIS_MODULE,
+ },
+#endif
};
static __net_init int xt_rateest_net_init(struct net *net)
@@ -213,12 +229,12 @@ static int __init xt_rateest_tg_init(void)
if (err)
return err;
- return xt_register_target(&xt_rateest_tg_reg);
+ return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
}
static void __exit xt_rateest_tg_fini(void)
{
- xt_unregister_target(&xt_rateest_tg_reg);
+ xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg));
unregister_pernet_subsys(&xt_rateest_net_ops);
}
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 5ce9461e979c..ff66b56a3f97 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Based on Rusty Russell's IPv4 REDIRECT target. Development of IPv6
* NAT funded by Astaro.
*/
@@ -49,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-/* FIXME: Take multiple ranges --RR */
static int redirect_tg4_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -68,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
static unsigned int
redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
+ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+ struct nf_nat_range2 range = {
+ .flags = mr->range[0].flags,
+ .min_proto = mr->range[0].min,
+ .max_proto = mr->range[0].max,
+ };
+
+ return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
}
static struct xt_target redirect_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index f16202d26c20..5bc5ea505eb9 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Module for modifying the secmark field of the skb, for use by
* security subsystems.
@@ -6,11 +7,6 @@
* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
*
* (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -25,15 +21,12 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification");
MODULE_ALIAS("ipt_SECMARK");
MODULE_ALIAS("ip6t_SECMARK");
-#define PFX "SECMARK: "
-
static u8 mode;
static unsigned int
-secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info)
{
u32 secmark = 0;
- const struct xt_secmark_target_info *info = par->targinfo;
switch (mode) {
case SECMARK_MODE_SEL:
@@ -47,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static int checkentry_lsm(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info_v1 *info)
{
int err;
@@ -79,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info)
return 0;
}
-static int secmark_tg_check(const struct xt_tgchk_param *par)
+static int
+secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info)
{
- struct xt_secmark_target_info *info = par->targinfo;
int err;
- if (strcmp(par->table, "mangle") != 0 &&
- strcmp(par->table, "security") != 0) {
+ if (strcmp(table, "mangle") != 0 &&
+ strcmp(table, "security") != 0) {
pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n",
- par->table);
+ table);
return -EINVAL;
}
@@ -122,25 +115,99 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
}
}
-static struct xt_target secmark_tg_reg __read_mostly = {
- .name = "SECMARK",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = secmark_tg_check,
- .destroy = secmark_tg_destroy,
- .target = secmark_tg,
- .targetsize = sizeof(struct xt_secmark_target_info),
- .me = THIS_MODULE,
+static int secmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .mode = info->mode,
+ };
+ int ret;
+
+ memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX);
+
+ ret = secmark_tg_check(par->table, &newinfo);
+ info->secid = newinfo.secid;
+
+ return ret;
+}
+
+static unsigned int
+secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_secmark_target_info *info = par->targinfo;
+ struct xt_secmark_target_info_v1 newinfo = {
+ .secid = info->secid,
+ };
+
+ return secmark_tg(skb, &newinfo);
+}
+
+static int secmark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ return secmark_tg_check(par->table, par->targinfo);
+}
+
+static unsigned int
+secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ return secmark_tg(skb, par->targinfo);
+}
+
+static struct xt_target secmark_tg_reg[] __read_mostly = {
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "SECMARK",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v0,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v0,
+ .targetsize = sizeof(struct xt_secmark_target_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "SECMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = secmark_tg_check_v1,
+ .destroy = secmark_tg_destroy,
+ .target = secmark_tg_v1,
+ .targetsize = sizeof(struct xt_secmark_target_info_v1),
+ .usersize = offsetof(struct xt_secmark_target_info_v1, secid),
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init secmark_tg_init(void)
{
- return xt_register_target(&secmark_tg_reg);
+ return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
static void __exit secmark_tg_exit(void)
{
- xt_unregister_target(&secmark_tg_reg);
+ xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg));
}
module_init(secmark_tg_init);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 98efb202f8b4..116a885adb3c 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for setting the MSS option in TCP packets.
*
* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
* Copyright (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -89,7 +86,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (par->fragoff != 0)
return 0;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return -1;
len = skb->len - tcphoff;
@@ -242,8 +239,8 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
oldlen = ipv6h->payload_len;
newlen = htons(ntohs(oldlen) + ret);
if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(csum_sub(skb->csum, oldlen),
- newlen);
+ skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen),
+ (__force __wsum)newlen);
ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index eb92bffff11c..93f064306901 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* A module for stripping a specific TCP option from TCP packets.
*
* Copyright (C) 2007 Sven Schnelle <svens@bitebene.org>
* Copyright © CC Computer Consultants GmbH, 2007
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -31,33 +28,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
static unsigned int
tcpoptstrip_mangle_packet(struct sk_buff *skb,
const struct xt_action_param *par,
- unsigned int tcphoff, unsigned int minlen)
+ unsigned int tcphoff)
{
const struct xt_tcpoptstrip_target_info *info = par->targinfo;
+ struct tcphdr *tcph, _th;
unsigned int optl, i, j;
- struct tcphdr *tcph;
u_int16_t n, o;
u_int8_t *opt;
- int len, tcp_hdrlen;
+ int tcp_hdrlen;
/* This is a fragment, no TCP header is available */
if (par->fragoff != 0)
return XT_CONTINUE;
- if (!skb_make_writable(skb, skb->len))
- return NF_DROP;
-
- len = skb->len - tcphoff;
- if (len < (int)sizeof(struct tcphdr))
+ tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th);
+ if (!tcph)
return NF_DROP;
- tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
tcp_hdrlen = tcph->doff * 4;
+ if (tcp_hdrlen < sizeof(struct tcphdr))
+ return NF_DROP;
- if (len < tcp_hdrlen)
+ if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen))
return NF_DROP;
- opt = (u_int8_t *)tcph;
+ /* must reload tcph, might have been moved */
+ tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
+ opt = (u8 *)tcph;
/*
* Walk through all TCP options - if we find some option to remove,
@@ -91,11 +88,10 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
static unsigned int
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
- sizeof(struct iphdr) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb));
}
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -109,8 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (tcphoff < 0)
return NF_DROP;
- return tcpoptstrip_mangle_packet(skb, par, tcphoff,
- sizeof(*ipv6h) + sizeof(struct tcphdr));
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff);
}
#endif
@@ -124,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_tcpoptstrip_target_info),
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IP6_NF_MANGLE)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TCPOPTSTRIP",
.family = NFPROTO_IPV6,
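
The rewritten tcpoptstrip_mangle_packet() above follows the safe header-mangling idiom: read the header through skb_header_pointer(), validate lengths, make the region writable, then recompute the pointer because skb_ensure_writable() may have copied the data. A compact sketch of the same idiom (assumed helper, not part of the patch):

static struct tcphdr *tcp_header_writable(struct sk_buff *skb,
					  unsigned int thoff)
{
	struct tcphdr *th, _th;

	/* safe read: works even if the header sits in a fragment page */
	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
	if (!th || th->doff * 4 < sizeof(_th))
		return NULL;

	/* unclone/linearize up to the full TCP header */
	if (skb_ensure_writable(skb, thoff + th->doff * 4))
		return NULL;

	/* data may have moved: reload from the now-writable skb */
	return (struct tcphdr *)(skb_network_header(skb) + thoff);
}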
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 1dae02a97ee3..a5ebd5640457 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* "TEE" target extension for Xtables
* Copyright © Sebastian Claßen, 2007
@@ -5,10 +6,6 @@
*
* based on ipt_ROUTE.c from Cédric de Launois
* <delaunois@info.ucl.be>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 or later, as published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/skbuff.h>
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index ad7420cdc439..e4bea1d346cf 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Transparent proxy support for Linux/iptables
*
* Copyright (c) 2006-2010 BalaBit IT Ltd.
* Author: Balazs Scheidler, Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -78,18 +74,10 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
-
- pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->daddr, ntohs(hp->dest),
- &laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n",
- iph->protocol, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
return NF_DROP;
}
@@ -126,16 +114,12 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
int tproto;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
- if (tproto < 0) {
- pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+ if (tproto < 0)
return NF_DROP;
- }
hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+ if (!hp)
return NF_DROP;
- }
/* check if there's an ongoing connection on the packet
* addresses, this happens if the redirect already happened
@@ -172,19 +156,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
/* This should be in a separate target, but we don't do multiple
targets on the same rule yet */
skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
-
- pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- laddr, ntohs(lport), skb->mark);
-
nf_tproxy_assign_sock(skb, sk);
return NF_ACCEPT;
}
- pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n",
- tproto, &iph->saddr, ntohs(hp->source),
- &iph->daddr, ntohs(hp->dest), skb->mark);
-
return NF_DROP;
}
@@ -204,6 +179,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par)
pr_info_ratelimited("Can be used only with -p tcp or -p udp\n");
return -EINVAL;
}
+
+static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv6_disable(par->net);
+}
#endif
static int tproxy_tg4_check(const struct xt_tgchk_param *par)
@@ -223,6 +203,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par)
return -EINVAL;
}
+static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+ nf_defrag_ipv4_disable(par->net);
+}
+
static struct xt_target tproxy_tg_reg[] __read_mostly = {
{
.name = "TPROXY",
@@ -232,6 +217,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 0,
.targetsize = sizeof(struct xt_tproxy_target_info),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -243,6 +229,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg4_check,
+ .destroy = tproxy_tg4_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
@@ -255,6 +242,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = {
.revision = 1,
.targetsize = sizeof(struct xt_tproxy_target_info_v1),
.checkentry = tproxy_tg6_check,
+ .destroy = tproxy_tg6_destroy,
.hooks = 1 << NF_INET_PRE_ROUTING,
.me = THIS_MODULE,
},
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 858d189a1303..a642ff09fc8e 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* This is a module which is used to mark packets for tracing.
*/
#include <linux/module.h>
@@ -28,26 +29,41 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static struct xt_target trace_tg_reg __read_mostly = {
- .name = "TRACE",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .table = "raw",
- .target = trace_tg,
- .checkentry = trace_tg_check,
- .destroy = trace_tg_destroy,
- .me = THIS_MODULE,
+static struct xt_target trace_tg_reg[] __read_mostly = {
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "TRACE",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .table = "raw",
+ .target = trace_tg,
+ .checkentry = trace_tg_check,
+ .destroy = trace_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init trace_tg_init(void)
{
- return xt_register_target(&trace_tg_reg);
+ return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
static void __exit trace_tg_exit(void)
{
- xt_unregister_target(&trace_tg_reg);
+ xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}
module_init(trace_tg_init);
module_exit(trace_tg_exit);
+MODULE_SOFTDEP("pre: nf_log_syslog");
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 89e281b3bfc2..a77088943107 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
 * iptables module to match inet_addr_type() of an IP address.
*
* Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
* (C) 2007 Laszlo Attila Toth <panther@balabit.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
@@ -36,7 +33,6 @@ MODULE_ALIAS("ip6t_addrtype");
static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
const struct in6_addr *addr, u16 mask)
{
- const struct nf_ipv6_ops *v6ops;
struct flowi6 flow;
struct rt6_info *rt;
u32 ret = 0;
@@ -47,18 +43,13 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
if (dev)
flow.flowi6_oif = dev->ifindex;
- v6ops = nf_get_ipv6_ops();
- if (v6ops) {
- if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
- if (v6ops->chk_addr(net, addr, dev, true))
- ret = XT_ADDRTYPE_LOCAL;
- }
- route_err = v6ops->route(net, (struct dst_entry **)&rt,
- flowi6_to_flowi(&flow), false);
- } else {
- route_err = 1;
+ if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
+ if (nf_ipv6_chk_addr(net, addr, dev, true))
+ ret = XT_ADDRTYPE_LOCAL;
}
+ route_err = nf_ip6_route(net, (struct dst_entry **)&rt,
+ flowi6_to_flowi(&flow), false);
if (route_err)
return XT_ADDRTYPE_UNREACHABLE;
@@ -217,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = {
},
{
.name = "addrtype",
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.revision = 1,
.match = addrtype_mt_v1,
.checkentry = addrtype_mt_checkentry_v1,
.matchsize = sizeof(struct xt_addrtype_info_v1),
.me = THIS_MODULE
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "addrtype",
+ .family = NFPROTO_IPV6,
+ .revision = 1,
+ .match = addrtype_mt_v1,
+ .checkentry = addrtype_mt_checkentry_v1,
+ .matchsize = sizeof(struct xt_addrtype_info_v1),
+ .me = THIS_MODULE
+ },
+#endif
};
static int __init addrtype_mt_init(void)
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index a2cf8a6236d6..849ac552a154 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Xtables module to match packets using a BPF filter.
* Copyright 2013 Google Inc.
* Written by Willem de Bruijn <willemb@google.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -93,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_bpf_info *info = par->matchinfo;
- return BPF_PROG_RUN(info->filter, skb);
+ return bpf_prog_run(info->filter, skb);
}
static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 5cb1ecb29ea4..c437fbd59ec1 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Xtables module to match the process control group.
*
@@ -6,10 +7,6 @@
* Matching is based upon processes tagged to net_cls' classid marker.
*
* (C) 2013 Daniel Borkmann <dborkman@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -26,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching");
MODULE_ALIAS("ipt_cgroup");
MODULE_ALIAS("ip6t_cgroup");
+#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n"
+
static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
{
struct xt_cgroup_info_v0 *info = par->matchinfo;
@@ -33,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
if (info->invert & ~1)
return -EINVAL;
+ if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
return 0;
}
@@ -54,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -86,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
return -EINVAL;
}
+ if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) {
+ pr_info(NET_CLS_CLASSID_INVALID_MSG);
+ return -EINVAL;
+ }
+
info->priv = NULL;
if (info->has_path) {
cgrp = cgroup_get_from_path(info->path);
@@ -103,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par)
static bool
cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
+#ifdef CONFIG_CGROUP_NET_CLASSID
const struct xt_cgroup_info_v0 *info = par->matchinfo;
struct sock *sk = skb->sk;
@@ -111,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
info->invert;
+#endif
+ return false;
}
static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
@@ -126,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
@@ -144,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
if (ancestor)
return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
info->invert_path;
+#ifdef CONFIG_CGROUP_NET_CLASSID
else
return (info->classid == sock_cgroup_classid(skcd)) ^
info->invert_classid;
+#endif
+ return false;
}
static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 51d0c257e7a5..908fd5f2c3c8 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -149,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par)
nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match xt_cluster_match __read_mostly = {
- .name = "cluster",
- .family = NFPROTO_UNSPEC,
- .match = xt_cluster_mt,
- .checkentry = xt_cluster_mt_checkentry,
- .matchsize = sizeof(struct xt_cluster_match_info),
- .destroy = xt_cluster_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match xt_cluster_match[] __read_mostly = {
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV4,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "cluster",
+ .family = NFPROTO_IPV6,
+ .match = xt_cluster_mt,
+ .checkentry = xt_cluster_mt_checkentry,
+ .matchsize = sizeof(struct xt_cluster_match_info),
+ .destroy = xt_cluster_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init xt_cluster_mt_init(void)
{
- return xt_register_match(&xt_cluster_match);
+ return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
static void __exit xt_cluster_mt_fini(void)
{
- xt_unregister_match(&xt_cluster_match);
+ xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match));
}
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c
index 5c861d2f21ca..f095557e3ef6 100644
--- a/net/netfilter/xt_comment.c
+++ b/net/netfilter/xt_comment.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Implements a dummy match to allow attaching comments to rules
*
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 93cb018c3055..2aabdcea8707 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family);
- if (ret < 0)
+ if (ret < 0) {
pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
par->family);
+ return ret;
+ }
/*
* This filter cannot function correctly unless connection tracking
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 4fa4efd24353..87505cdad5f1 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2013 Astaro GmbH & Co KG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -15,7 +12,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
-MODULE_DESCRIPTION("Xtables: add/match connection trackling labels");
+MODULE_DESCRIPTION("Xtables: add/match connection tracking labels");
MODULE_ALIAS("ipt_connlabel");
MODULE_ALIAS("ip6t_connlabel");
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index bc6c8ab0fa62..848287ab79cf 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -13,6 +13,8 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
@@ -29,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = xt_net(par);
const struct xt_connlimit_info *info = par->matchinfo;
- struct nf_conntrack_tuple tuple;
- const struct nf_conntrack_tuple *tuple_ptr = &tuple;
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
@@ -38,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
u32 key[5];
ct = nf_ct_get(skb, &ctinfo);
- if (ct != NULL) {
- tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ if (ct)
zone = nf_ct_zone(ct);
- } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
- xt_family(par), net, &tuple)) {
- goto hotdrop;
- }
if (xt_family(par) == NFPROTO_IPV6) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -60,17 +55,16 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
key[4] = zone->id;
} else {
const struct iphdr *iph = ip_hdr(skb);
- key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
- iph->daddr : iph->saddr;
- key[0] &= info->mask.ip;
+ key[0] = (info->flags & XT_CONNLIMIT_DADDR) ?
+ (__force __u32)iph->daddr : (__force __u32)iph->saddr;
+ key[0] &= (__force __u32)info->mask.ip;
key[1] = zone->id;
}
- connections = nf_conncount_count(net, info->data, key, tuple_ptr,
- zone);
+ connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key);
if (connections == 0)
- /* kmalloc failed, drop it entirely */
+ /* kmalloc failed or tuple couldn't be found, drop it entirely */
goto hotdrop;
return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
@@ -84,6 +78,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
{
struct xt_connlimit_info *info = par->matchinfo;
unsigned int keylen;
+ int ret;
keylen = sizeof(u32);
if (par->family == NFPROTO_IPV6)
@@ -91,8 +86,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
else
keylen += sizeof(struct in_addr);
+ ret = nf_ct_netns_get(par->net, par->family);
+ if (ret < 0) {
+ pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
+ }
+
/* init private data */
- info->data = nf_conncount_init(par->net, par->family, keylen);
+ info->data = nf_conncount_init(par->net, keylen);
+ if (IS_ERR(info->data))
+ nf_ct_netns_put(par->net, par->family);
return PTR_ERR_OR_ZERO(info->data);
}
@@ -101,29 +105,45 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_connlimit_info *info = par->matchinfo;
- nf_conncount_destroy(par->net, par->family, info->data);
+ nf_conncount_destroy(par->net, info->data);
+ nf_ct_netns_put(par->net, par->family);
}
-static struct xt_match connlimit_mt_reg __read_mostly = {
- .name = "connlimit",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = connlimit_mt_check,
- .match = connlimit_mt,
- .matchsize = sizeof(struct xt_connlimit_info),
- .usersize = offsetof(struct xt_connlimit_info, data),
- .destroy = connlimit_mt_destroy,
- .me = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV4,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "connlimit",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connlimit_mt_check,
+ .match = connlimit_mt,
+ .matchsize = sizeof(struct xt_connlimit_info),
+ .usersize = offsetof(struct xt_connlimit_info, data),
+ .destroy = connlimit_mt_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static int __init connlimit_mt_init(void)
{
- return xt_register_match(&connlimit_mt_reg);
+ return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
static void __exit connlimit_mt_exit(void)
{
- xt_unregister_match(&connlimit_mt_reg);
+ xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
}
module_init(connlimit_mt_init);
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 29c38aa7f726..4277084de2e7 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -1,23 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* xt_connmark - Netfilter module to operate on connection marks
*
- * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
* by Henrik Nordstrom <hno@marasystems.com>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
* Jan Engelhardt <jengelh@medozas.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/module.h>
@@ -42,6 +30,7 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
+ u_int32_t oldmark;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
@@ -49,14 +38,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
switch (info->mode) {
case XT_CONNMARK_SET:
- newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+ oldmark = READ_ONCE(ct->mark);
+ newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
if (info->shift_dir == D_SHIFT_RIGHT)
newmark >>= info->shift_bits;
else
newmark <<= info->shift_bits;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
@@ -67,15 +57,15 @@ connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
else
new_targetmark <<= info->shift_bits;
- newmark = (ct->mark & ~info->ctmask) ^
+ newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^
new_targetmark;
- if (ct->mark != newmark) {
- ct->mark = newmark;
+ if (READ_ONCE(ct->mark) != newmark) {
+ WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
- new_targetmark = (ct->mark & info->ctmask);
+ new_targetmark = (READ_ONCE(ct->mark) & info->ctmask);
if (info->shift_dir == D_SHIFT_RIGHT)
new_targetmark >>= info->shift_bits;
else
@@ -138,7 +128,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL)
return false;
- return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+ return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert;
}
static int connmark_mt_check(const struct xt_mtchk_param *par)
@@ -161,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 1,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg,
.targetsize = sizeof(struct xt_connmark_tginfo1),
@@ -171,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 2,
- .family = NFPROTO_UNSPEC,
+ .family = NFPROTO_IPV4,
.checkentry = connmark_tg_check,
.target = connmark_tg_v2,
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
- }
+ },
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "CONNMARK",
+ .revision = 1,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg,
+ .targetsize = sizeof(struct xt_connmark_tginfo1),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "CONNMARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg_v2,
+ .targetsize = sizeof(struct xt_connmark_tginfo2),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match connmark_mt_reg __read_mostly = {
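
All ct->mark accesses in the connmark hunks above switch to READ_ONCE()/WRITE_ONCE(): the mark can be updated concurrently from other CPUs without a lock, so plain loads and stores are data races. A minimal sketch of the resulting update pattern:

static void ct_mark_update(struct nf_conn *ct, u32 newmark)
{
	/* annotated lockless access; event fires only on a real change */
	if (READ_ONCE(ct->mark) != newmark) {
		WRITE_ONCE(ct->mark, newmark);
		nf_conntrack_event_cache(IPCT_MARK, ct);
	}
}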
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index df80fe7d391c..ea299da24734 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_conntrack - Netfilter module to match connection tracking
* information. (Superset of Rusty's minimalistic state match.)
@@ -5,10 +6,6 @@
* (C) 2001 Marc Boucher (marc@mbsi.ca).
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index c7a2e5466bc4..3bdc302a0f91 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match running CPU */
/*
 * Might be used to distribute connections across several daemons, if
 * RPS (Receive Packet Steering) is enabled or the NIC is multiqueue
 * capable, with each RX queue IRQ affined to one CPU (1:1 mapping)
- *
*/
/* (C) 2010 Eric Dumazet
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index b63d2a3d80ba..e5a13ecbe67a 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* iptables module for DCCP protocol header matching
*
* (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index 96ebe1cdefec..9520dd00070b 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index a4c2b862f820..fb0169a8f9bb 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* IP tables module for matching the value of the IPv4/IPv6 DSCP field
*
* (C) 2002 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c
index c7ad4afa5fb8..b96e8203ac54 100644
--- a/net/netfilter/xt_ecn.c
+++ b/net/netfilter/xt_ecn.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits
*
* (C) 2002 by Harald Welte <laforge@gnumonks.org>
* (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 171ba82b5902..2a1c0ad0ff07 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match ESP parameters. */
/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 8d86e39d6280..3b507694e81e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_hashlimit - Netfilter module to limit the number of packets per time
 * unit, separately for each hashbucket (sourceip/sourceport/dstip/dstport)
@@ -14,7 +15,6 @@
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/slab.h>
-#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/list.h>
@@ -33,9 +33,15 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_hashlimit.h>
#include <linux/mutex.h>
#include <linux/kernel.h>
+#include <linux/refcount.h>
+#include <uapi/linux/netfilter/xt_hashlimit.h>
+
+#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
+ XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \
+ XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\
+ XT_HASHLIMIT_RATE_MATCH)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@@ -108,7 +114,7 @@ struct dsthash_ent {
struct xt_hashlimit_htable {
struct hlist_node node; /* global list of all htables */
- int use;
+ refcount_t use;
u_int8_t family;
bool rnd_initialized;
@@ -125,7 +131,7 @@ struct xt_hashlimit_htable {
const char *name;
struct net *net;
- struct hlist_head hash[0]; /* hashtable itself */
+ struct hlist_head hash[]; /* hashtable itself */
};
static int
@@ -287,9 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
if (size < 16)
size = 16;
}
- /* FIXME: don't use vmalloc() here or anywhere else -HW */
- hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) +
- sizeof(struct hlist_head) * size);
+ hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL);
if (hinfo == NULL)
return -ENOMEM;
*out_hinfo = hinfo;
@@ -297,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
/* copy match config into hashtable config */
ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
if (ret) {
- vfree(hinfo);
+ kvfree(hinfo);
return ret;
}
@@ -310,13 +314,13 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
for (i = 0; i < hinfo->cfg.size; i++)
INIT_HLIST_HEAD(&hinfo->hash[i]);
- hinfo->use = 1;
+ refcount_set(&hinfo->use, 1);
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
hinfo->name = kstrdup(name, GFP_KERNEL);
if (!hinfo->name) {
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
spin_lock_init(&hinfo->lock);
@@ -338,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
ops, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
- vfree(hinfo);
+ kvfree(hinfo);
return -ENOMEM;
}
hinfo->net = net;
@@ -352,31 +356,21 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
return 0;
}
-static bool select_all(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he)
-{
- return true;
-}
-
-static bool select_gc(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he)
-{
- return time_after_eq(jiffies, he->expires);
-}
-
-static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
- bool (*select)(const struct xt_hashlimit_htable *ht,
- const struct dsthash_ent *he))
+static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all)
{
unsigned int i;
for (i = 0; i < ht->cfg.size; i++) {
+ struct hlist_head *head = &ht->hash[i];
struct dsthash_ent *dh;
struct hlist_node *n;
+ if (hlist_empty(head))
+ continue;
+
spin_lock_bh(&ht->lock);
- hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
- if ((*select)(ht, dh))
+ hlist_for_each_entry_safe(dh, n, head, node) {
+ if (time_after_eq(jiffies, dh->expires) || select_all)
dsthash_free(ht, dh);
}
spin_unlock_bh(&ht->lock);
@@ -390,7 +384,7 @@ static void htable_gc(struct work_struct *work)
ht = container_of(work, struct xt_hashlimit_htable, gc_work.work);
- htable_selective_cleanup(ht, select_gc);
+ htable_selective_cleanup(ht, false);
queue_delayed_work(system_power_efficient_wq,
&ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval));
@@ -410,15 +404,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
remove_proc_entry(hinfo->name, parent);
}
-static void htable_destroy(struct xt_hashlimit_htable *hinfo)
-{
- cancel_delayed_work_sync(&hinfo->gc_work);
- htable_remove_proc_entry(hinfo);
- htable_selective_cleanup(hinfo, select_all);
- kfree(hinfo->name);
- vfree(hinfo);
-}
-
static struct xt_hashlimit_htable *htable_find_get(struct net *net,
const char *name,
u_int8_t family)
@@ -429,7 +414,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
if (!strcmp(name, hinfo->name) &&
hinfo->family == family) {
- hinfo->use++;
+ refcount_inc(&hinfo->use);
return hinfo;
}
}
@@ -438,12 +423,16 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
static void htable_put(struct xt_hashlimit_htable *hinfo)
{
- mutex_lock(&hashlimit_mutex);
- if (--hinfo->use == 0) {
+ if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) {
hlist_del(&hinfo->node);
- htable_destroy(hinfo);
+ htable_remove_proc_entry(hinfo);
+ mutex_unlock(&hashlimit_mutex);
+
+ cancel_delayed_work_sync(&hinfo->gc_work);
+ htable_selective_cleanup(hinfo, true);
+ kfree(hinfo->name);
+ kvfree(hinfo);
}
- mutex_unlock(&hashlimit_mutex);
}
/* The algorithm used is the Simple Token Bucket Filter (TBF)
@@ -846,6 +835,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3);
}
+#define HASHLIMIT_MAX_SIZE 1048576
+
static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
struct xt_hashlimit_htable **hinfo,
struct hashlimit_cfg3 *cfg,
@@ -856,6 +847,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
if (cfg->gc_interval == 0 || cfg->expire == 0)
return -EINVAL;
+ if (cfg->size > HASHLIMIT_MAX_SIZE) {
+ cfg->size = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("size too large, truncated to %u\n", cfg->size);
+ }
+ if (cfg->max > HASHLIMIT_MAX_SIZE) {
+ cfg->max = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("max too large, truncated to %u\n", cfg->max);
+ }
if (par->family == NFPROTO_IPV4) {
if (cfg->srcmask > 32 || cfg->dstmask > 32)
return -EINVAL;
@@ -1055,7 +1054,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
__acquires(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket;
spin_lock_bh(&htable->lock);
@@ -1072,7 +1071,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos)
static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
*pos = ++(*bucket);
@@ -1086,7 +1085,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void dl_seq_stop(struct seq_file *s, void *v)
__releases(htable->lock)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
if (!IS_ERR(bucket))
@@ -1128,7 +1127,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1143,7 +1142,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1158,7 +1157,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
- struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));
spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
@@ -1172,7 +1171,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
static int dl_seq_show_v2(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = (unsigned int *)v;
struct dsthash_ent *ent;
@@ -1186,7 +1185,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v)
static int dl_seq_show_v1(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
@@ -1200,7 +1199,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v)
static int dl_seq_show(struct seq_file *s, void *v)
{
- struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file));
+ struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file));
unsigned int *bucket = v;
struct dsthash_ent *ent;
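The PDE_DATA() -> pde_data() changes throughout this file track the kernel-wide rename of the /proc private-data accessor (v5.17); behavior is unchanged, it still returns the pointer handed to proc_create_data(). A minimal sketch, with an illustrative show handler:

	static int example_show(struct seq_file *s, void *v)
	{
		/* Retrieve the pointer registered via proc_create_data(). */
		struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file));

		seq_printf(s, "buckets: %u\n", ht->cfg.size);
		return 0;
	}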
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index fd077aeaaed9..a5a167f941e0 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* iptables module to match on related connections */
/*
* (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c
index 003951149c9e..c1a70f8f0441 100644
--- a/net/netfilter/xt_hl.c
+++ b/net/netfilter/xt_hl.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IP tables module for matching the value of the TTL
* (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
*
* Hop Limit matching module
* (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/ip.h>
diff --git a/net/netfilter/xt_ipcomp.c b/net/netfilter/xt_ipcomp.c
index 57f1df575701..472da639a32e 100644
--- a/net/netfilter/xt_ipcomp.c
+++ b/net/netfilter/xt_ipcomp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Kernel module to match IPComp parameters for IPv4 and IPv6
*
* Copyright (C) 2013 WindRiver
@@ -7,11 +8,6 @@
*
* Based on:
* net/netfilter/xt_esp.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index b46626cddd93..0c9e014e30b4 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_iprange - Netfilter module to match IP address ranges
*
- * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
* (C) CC Computer Consultants GmbH, 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -133,7 +130,7 @@ static void __exit iprange_mt_exit(void)
module_init(iprange_mt_init);
module_exit(iprange_mt_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching");
MODULE_ALIAS("ipt_iprange");
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 1d950a6100af..253c71cc9a63 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_ipvs - kernel module to match IPVS connection properties
*
diff --git a/net/netfilter/xt_l2tp.c b/net/netfilter/xt_l2tp.c
index c43482bf48e6..a61eb81e9f49 100644
--- a/net/netfilter/xt_l2tp.c
+++ b/net/netfilter/xt_l2tp.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match L2TP header parameters. */
/* (C) 2013 James Chapman <jchapman@katalix.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
index 176e5570a999..ca730cedb5d4 100644
--- a/net/netfilter/xt_length.c
+++ b/net/netfilter/xt_length.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match packet length. */
/* (C) 1999-2001 James Morris <jmorris@intercode.com.au>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -24,7 +21,7 @@ static bool
length_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
+ u32 pktlen = skb_ip_totlen(skb);
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
@@ -33,8 +30,7 @@ static bool
length_mt6(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_length_info *info = par->matchinfo;
- const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) +
- sizeof(struct ipv6hdr);
+ u32 pktlen = skb->len;
return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
}
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 9f098ecb2449..8b4fd27857f2 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -1,26 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
* (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
-#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_limit.h>
struct xt_limit_priv {
- spinlock_t lock;
unsigned long prev;
- uint32_t credit;
+ u32 credit;
};
MODULE_LICENSE("GPL");
@@ -69,22 +64,31 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateinfo *r = par->matchinfo;
struct xt_limit_priv *priv = r->master;
- unsigned long now = jiffies;
-
- spin_lock_bh(&priv->lock);
- priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
- if (priv->credit > r->credit_cap)
- priv->credit = r->credit_cap;
-
- if (priv->credit >= r->cost) {
- /* We're not limited. */
- priv->credit -= r->cost;
- spin_unlock_bh(&priv->lock);
- return true;
- }
-
- spin_unlock_bh(&priv->lock);
- return false;
+ unsigned long now;
+ u32 old_credit, new_credit, credit_increase = 0;
+ bool ret;
+
+ /* fastpath if there is nothing to update */
+ if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies))
+ return false;
+
+ do {
+ now = jiffies;
+ credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+ old_credit = READ_ONCE(priv->credit);
+ new_credit = old_credit;
+ new_credit += credit_increase;
+ if (new_credit > r->credit_cap)
+ new_credit = r->credit_cap;
+ if (new_credit >= r->cost) {
+ ret = true;
+ new_credit -= r->cost;
+ } else {
+ ret = false;
+ }
+ } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit);
+
+ return ret;
}
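The spinlock in xt_limit gives way to a lockless retry loop: take a snapshot of the credit counter, compute the refilled and debited value, and publish it with cmpxchg(); if another CPU updated priv->credit in the meantime the compare fails and the loop runs again on a fresh snapshot. The idiom in isolation, with illustrative names:

	static u32 counter;	/* shared credit pool (illustrative) */

	static void refill_and_spend(u32 refill, u32 cap)
	{
		u32 old_v, new_v;

		do {
			old_v = READ_ONCE(counter);	  /* snapshot */
			new_v = min(old_v + refill, cap); /* refill, clamped */
		} while (cmpxchg(&counter, old_v, new_v) != old_v); /* retry on race */
	}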
/* Precision saver. */
@@ -125,7 +129,6 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
r->credit_cap = priv->credit; /* Credits full. */
r->cost = user2credits(r->avg);
}
- spin_lock_init(&priv->lock);
return 0;
}
@@ -137,7 +140,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par)
kfree(info->master);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_xt_rateinfo {
u_int32_t avg;
u_int32_t burst;
@@ -179,7 +182,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src)
};
return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
}
-#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */
static struct xt_match limit_mt_reg __read_mostly = {
.name = "limit",
@@ -189,7 +192,7 @@ static struct xt_match limit_mt_reg __read_mostly = {
.checkentry = limit_mt_check,
.destroy = limit_mt_destroy,
.matchsize = sizeof(struct xt_rateinfo),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct compat_xt_rateinfo),
.compat_from_user = limit_mt_compat_from_user,
.compat_to_user = limit_mt_compat_to_user,
diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c
index d5b4fd4f91ed..81649da57ba5 100644
--- a/net/netfilter/xt_mac.c
+++ b/net/netfilter/xt_mac.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match MAC address parameters. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index ebd41dc501e5..59b9d04400ca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_mark - Netfilter module to match NFMARK value
*
* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
* Jan Engelhardt <jengelh@medozas.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -42,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
-static struct xt_target mark_tg_reg __read_mostly = {
- .name = "MARK",
- .revision = 2,
- .family = NFPROTO_UNSPEC,
- .target = mark_tg,
- .targetsize = sizeof(struct xt_mark_tginfo2),
- .me = THIS_MODULE,
+static struct xt_target mark_tg_reg[] __read_mostly = {
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_ARP,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV6,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
};
static struct xt_match mark_mt_reg __read_mostly = {
@@ -64,12 +83,12 @@ static int __init mark_mt_init(void)
{
int ret;
- ret = xt_register_target(&mark_tg_reg);
+ ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
if (ret < 0)
return ret;
ret = xt_register_match(&mark_mt_reg);
if (ret < 0) {
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
return ret;
}
return 0;
@@ -78,7 +97,7 @@ static int __init mark_mt_init(void)
static void __exit mark_mt_exit(void)
{
xt_unregister_match(&mark_mt_reg);
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
}
module_init(mark_mt_init);
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index 1cde0e4985b7..44a00f5acde8 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports:
ports are in the same place so we can treat them as equal. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c
index ac91170fc8c8..b4f7bbc3f3ca 100644
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -14,7 +11,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat.h>
static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
{
@@ -247,3 +244,4 @@ MODULE_ALIAS("ipt_SNAT");
MODULE_ALIAS("ipt_DNAT");
MODULE_ALIAS("ip6t_SNAT");
MODULE_ALIAS("ip6t_DNAT");
+MODULE_DESCRIPTION("SNAT and DNAT targets support");
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 6b56f4170860..0ca1cdfc4095 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org>
- * (C) 2011 Intra2net AG <http://www.intra2net.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 (or any
- * later at your option) as published by the Free Software Foundation.
+ * (C) 2011 Intra2net AG <https://www.intra2net.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,7 +27,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
overquota = nfnl_acct_overquota(xt_net(par), info->nfacct);
- return overquota == NFACCT_UNDERQUOTA ? false : true;
+ return overquota != NFACCT_UNDERQUOTA;
}
static int
@@ -41,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
nfacct = nfnl_acct_find_get(par->net, info->name);
if (nfacct == NULL) {
- pr_info_ratelimited("accounting object `%s' does not exists\n",
- info->name);
+ pr_info_ratelimited("accounting object `%.*s' does not exist\n",
+ NFACCT_NAME_MAX, info->name);
return -ENOENT;
}
info->nfacct = nfacct;
@@ -57,25 +54,39 @@ nfacct_mt_destroy(const struct xt_mtdtor_param *par)
nfnl_acct_put(info->nfacct);
}
-static struct xt_match nfacct_mt_reg __read_mostly = {
- .name = "nfacct",
- .family = NFPROTO_UNSPEC,
- .checkentry = nfacct_mt_checkentry,
- .match = nfacct_mt,
- .destroy = nfacct_mt_destroy,
- .matchsize = sizeof(struct xt_nfacct_match_info),
- .usersize = offsetof(struct xt_nfacct_match_info, nfacct),
- .me = THIS_MODULE,
+static struct xt_match nfacct_mt_reg[] __read_mostly = {
+ {
+ .name = "nfacct",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info),
+ .usersize = offsetof(struct xt_nfacct_match_info, nfacct),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "nfacct",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfacct_mt_checkentry,
+ .match = nfacct_mt,
+ .destroy = nfacct_mt_destroy,
+ .matchsize = sizeof(struct xt_nfacct_match_info_v1),
+ .usersize = offsetof(struct xt_nfacct_match_info_v1, nfacct),
+ .me = THIS_MODULE,
+ },
};
static int __init nfacct_mt_init(void)
{
- return xt_register_match(&nfacct_mt_reg);
+ return xt_register_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
}
static void __exit nfacct_mt_exit(void)
{
- xt_unregister_match(&nfacct_mt_reg);
+ xt_unregister_matches(nfacct_mt_reg, ARRAY_SIZE(nfacct_mt_reg));
}
module_init(nfacct_mt_init);
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 7a103553d10d..dc9485854002 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -84,4 +71,3 @@ MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
MODULE_DESCRIPTION("Passive OS fingerprint matching.");
MODULE_ALIAS("ipt_osf");
MODULE_ALIAS("ip6t_osf");
-MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 46686fb73784..50332888c8d2 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel module to match various things tied to sockets associated with
* locally generated outgoing packets.
@@ -5,10 +6,6 @@
* (C) 2000 Marc Boucher <marc@mbsi.ca>
*
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -25,6 +22,9 @@ static int owner_check(const struct xt_mtchk_param *par)
struct xt_owner_match_info *info = par->matchinfo;
struct net *net = par->net;
+ if (info->match & ~XT_OWNER_MASK)
+ return -EINVAL;
+
/* Only allow the common case where the userns of the writer
* matches the userns of the network namespace.
*/
@@ -76,29 +76,54 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = sk->sk_socket->file;
- if (filp == NULL)
+ read_lock_bh(&sk->sk_callback_lock);
+ filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+ if (filp == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
+ }
if (info->match & XT_OWNER_UID) {
kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
uid_lte(filp->f_cred->fsuid, uid_max)) ^
- !(info->invert & XT_OWNER_UID))
+ !(info->invert & XT_OWNER_UID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
if (info->match & XT_OWNER_GID) {
+ unsigned int i, match = false;
kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
- if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
- gid_lte(filp->f_cred->fsgid, gid_max)) ^
- !(info->invert & XT_OWNER_GID))
+ struct group_info *gi = filp->f_cred->group_info;
+
+ if (gid_gte(filp->f_cred->fsgid, gid_min) &&
+ gid_lte(filp->f_cred->fsgid, gid_max))
+ match = true;
+
+ if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) {
+ for (i = 0; i < gi->ngroups; ++i) {
+ kgid_t group = gi->gid[i];
+
+ if (gid_gte(group, gid_min) &&
+ gid_lte(group, gid_max)) {
+ match = true;
+ break;
+ }
+ }
+ }
+
+ if (match ^ !(info->invert & XT_OWNER_GID)) {
+ read_unlock_bh(&sk->sk_callback_lock);
return false;
+ }
}
+ read_unlock_bh(&sk->sk_callback_lock);
return true;
}
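With XT_OWNER_SUPPL_GROUPS set, the GID range is now checked against the socket owner's supplementary groups as well as fsgid, all under sk_callback_lock so sk->sk_socket cannot be torn down mid-match. A hedged usage example, assuming an iptables build that exposes the flag as --suppl-groups:

	# Match packets from processes whose fsgid *or* any supplementary
	# group falls in 1000-1999.
	iptables -A OUTPUT -m owner --gid-owner 1000-1999 --suppl-groups -j ACCEPT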
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index 4034d70bff39..343e65f377d4 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -1,19 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match the bridge port in and
* out device for IP packets coming into contact with a bridge. */
/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/if.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter_bridge.h>
-#include <linux/netfilter/xt_physdev.h>
#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/br_netfilter.h>
+#include <uapi/linux/netfilter/xt_physdev.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
@@ -61,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
(!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
return false;
- physdev = nf_bridge_get_physindev(skb);
+ physdev = nf_bridge_get_physindev(skb, xt_net(par));
indev = physdev ? physdev->name : NULL;
if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
@@ -96,8 +94,7 @@ match_outdev:
static int physdev_mt_check(const struct xt_mtchk_param *par)
{
const struct xt_physdev_info *info = par->matchinfo;
-
- br_netfilter_enable();
+ static bool brnf_probed __read_mostly;
if (!(info->bitmask & XT_PHYSDEV_OP_MASK) ||
info->bitmask & ~XT_PHYSDEV_OP_MASK)
@@ -105,12 +102,16 @@ static int physdev_mt_check(const struct xt_mtchk_param *par)
if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) &&
(!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) ||
info->invert & XT_PHYSDEV_OP_BRIDGED) &&
- par->hook_mask & ((1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) {
+ par->hook_mask & (1 << NF_INET_LOCAL_OUT)) {
pr_info_ratelimited("--physdev-out and --physdev-is-out only supported in the FORWARD and POSTROUTING chains with bridged traffic\n");
- if (par->hook_mask & (1 << NF_INET_LOCAL_OUT))
- return -EINVAL;
+ return -EINVAL;
}
+
+ if (!brnf_probed) {
+ brnf_probed = true;
+ request_module("br_netfilter");
+ }
+
return 0;
}
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 1ef99151b3ba..f48946aef49f 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index aa84e8121c93..cb6e8279010a 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* IP tables module for matching IPsec policy
*
* Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index 10d61a6eed71..4452cc93b990 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* netfilter module to enforce network quotas
*
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index bf77326861af..72324bd976af 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* (C) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/skbuff.h>
diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c
index 459a7b256eb2..6df485f4403d 100644
--- a/net/netfilter/xt_realm.c
+++ b/net/netfilter/xt_realm.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* IP tables module for matching the routing realm
*
* (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index f44de4bc2100..588a5e6ad899 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* This is a replacement of the old ipt_recent module, which carried the
* following copyright notice:
*
@@ -62,9 +59,9 @@ MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* fil
/* retained for backwards compatibility */
static unsigned int ip_pkt_list_tot __read_mostly;
module_param(ip_pkt_list_tot, uint, 0400);
-MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 255)");
+MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 65535)");
-#define XT_RECENT_MAX_NSTAMPS 256
+#define XT_RECENT_MAX_NSTAMPS 65536
struct recent_entry {
struct list_head list;
@@ -72,9 +69,9 @@ struct recent_entry {
union nf_inet_addr addr;
u_int16_t family;
u_int8_t ttl;
- u_int8_t index;
+ u_int16_t index;
u_int16_t nstamps;
- unsigned long stamps[0];
+ unsigned long stamps[];
};
struct recent_table {
@@ -83,9 +80,9 @@ struct recent_table {
union nf_inet_addr mask;
unsigned int refcnt;
unsigned int entries;
- u8 nstamps_max_mask;
+ u_int16_t nstamps_max_mask;
struct list_head lru_list;
- struct list_head iphash[0];
+ struct list_head iphash[];
};
struct recent_net {
@@ -106,7 +103,7 @@ static DEFINE_SPINLOCK(recent_lock);
static DEFINE_MUTEX(recent_mutex);
#ifdef CONFIG_PROC_FS
-static const struct file_operations recent_mt_fops;
+static const struct proc_ops recent_mt_proc_ops;
#endif
static u_int32_t hash_rnd __read_mostly;
@@ -155,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
/*
* Drop entries with timestamps older than 'time'.
*/
-static void recent_entry_reap(struct recent_table *t, unsigned long time)
+static void recent_entry_reap(struct recent_table *t, unsigned long time,
+ struct recent_entry *working, bool update)
{
struct recent_entry *e;
@@ -165,6 +163,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time)
e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
/*
+ * Do not reap the entry that is about to be updated.
+ */
+ if (e == working && update)
+ return;
+
+ /*
* The last time stamp is the most recent.
*/
if (time_after(time, e->stamps[e->index-1]))
@@ -306,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
/* info->seconds must be non-zero */
if (info->check_set & XT_RECENT_REAP)
- recent_entry_reap(t, time);
+ recent_entry_reap(t, time, e,
+ info->check_set & XT_RECENT_UPDATE && ret);
}
if (info->check_set & XT_RECENT_SET ||
@@ -337,7 +342,6 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
unsigned int nstamp_mask;
unsigned int i;
int ret = -EINVAL;
- size_t sz;
net_get_random_once(&hash_rnd, sizeof(hash_rnd));
@@ -387,8 +391,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
goto out;
}
- sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size;
- t = kvzalloc(sz, GFP_KERNEL);
+ t = kvzalloc(struct_size(t, iphash, ip_list_hash_size), GFP_KERNEL);
if (t == NULL) {
ret = -ENOMEM;
goto out;
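The open-coded size computation becomes struct_size(), which pairs with the iphash[0] -> iphash[] flexible-array conversion earlier in this diff: it evaluates sizeof(*t) + n * sizeof(t->iphash[0]) with overflow checking, saturating so a hostile count makes the allocation fail cleanly instead of wrapping. A condensed sketch with an illustrative struct:

	struct table {
		unsigned int entries;
		struct list_head iphash[];	/* flexible array member */
	};

	struct table *t;

	/* struct_size() only inspects the type of t, so the pointer may be
	 * uninitialized here; kvzalloc() falls back from kmalloc to vmalloc
	 * for large tables.
	 */
	t = kvzalloc(struct_size(t, iphash, ip_list_hash_size), GFP_KERNEL);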
@@ -410,7 +413,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
goto out;
}
pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
- &recent_mt_fops, t);
+ &recent_mt_proc_ops, t);
if (pde == NULL) {
recent_table_free(t);
ret = -ENOMEM;
@@ -497,12 +500,12 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
const struct recent_entry *e = v;
const struct list_head *head = e->list.next;
+ (*pos)++;
while (head == &t->iphash[st->bucket]) {
if (++st->bucket >= ip_list_hash_size)
return NULL;
head = t->iphash[st->bucket].next;
}
- (*pos)++;
return list_entry(head, struct recent_entry, list);
}
@@ -548,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file)
if (st == NULL)
return -ENOMEM;
- st->table = PDE_DATA(inode);
+ st->table = pde_data(inode);
return 0;
}
@@ -556,9 +559,9 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- struct recent_table *t = PDE_DATA(file_inode(file));
+ struct recent_table *t = pde_data(file_inode(file));
struct recent_entry *e;
- char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
+ char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")];
const char *c = buf;
union nf_inet_addr addr = {};
u_int16_t family;
@@ -621,13 +624,12 @@ recent_mt_proc_write(struct file *file, const char __user *input,
return size + 1;
}
-static const struct file_operations recent_mt_fops = {
- .open = recent_seq_open,
- .read = seq_read,
- .write = recent_mt_proc_write,
- .release = seq_release_private,
- .owner = THIS_MODULE,
- .llseek = seq_lseek,
+static const struct proc_ops recent_mt_proc_ops = {
+ .proc_open = recent_seq_open,
+ .proc_read = seq_read,
+ .proc_write = recent_mt_proc_write,
+ .proc_release = seq_release_private,
+ .proc_lseek = seq_lseek,
};
static int __net_init recent_proc_net_init(struct net *net)
@@ -646,7 +648,7 @@ static void __net_exit recent_proc_net_exit(struct net *net)
struct recent_table *t;
/* recent_net_exit() is called before recent_mt_destroy(). Make sure
- * that the parent xt_recent proc entry is is empty before trying to
+ * that the parent xt_recent proc entry is empty before trying to
* remove it.
*/
spin_lock_bh(&recent_lock);
diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
index 68ccbe50bb1e..600060ca940a 100644
--- a/net/netfilter/xt_repldata.h
+++ b/net/netfilter/xt_repldata.h
@@ -29,7 +29,7 @@
if (tbl == NULL) \
return NULL; \
term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
- strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+ strscpy(tbl->repl.name, info->name); \
*term = (struct type##_error)typ2##_ERROR_INIT; \
tbl->repl.valid_hooks = hook_mask; \
tbl->repl.num_entries = nhooks + 1; \
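strncpy() can leave the destination unterminated when the source fills it; strscpy() always NUL-terminates and returns -E2BIG on truncation. The two-argument form used here infers the size from the destination array at compile time (a macro available in recent kernels). A sketch with an illustrative buffer:

	char name[XT_TABLE_MAXNAMELEN];

	/* Copies at most sizeof(name) - 1 bytes and always terminates;
	 * a negative return flags truncation.
	 */
	if (strscpy(name, info->name) < 0)
		return NULL;	/* name would have been truncated */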
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 2d2fa1d53ea6..b46a6a512058 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
-#include <net/sctp/sctp.h>
#include <linux/sctp.h>
#include <linux/netfilter/x_tables.h>
@@ -149,6 +149,8 @@ static int sctp_mt_check(const struct xt_mtchk_param *par)
{
const struct xt_sctp_info *info = par->matchinfo;
+ if (info->flag_count > ARRAY_SIZE(info->flag_info))
+ return -EINVAL;
if (info->flags & ~XT_SCTP_VALID_FLAGS)
return -EINVAL;
if (info->invflags & ~XT_SCTP_VALID_FLAGS)
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index bf2890b13212..731bc2cafae4 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
/* Kernel module which implements the set match and SET target
@@ -17,11 +14,10 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <uapi/linux/netfilter/xt_set.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
MODULE_DESCRIPTION("Xtables: IP set match and target module");
MODULE_ALIAS("xt_SET");
MODULE_ALIAS("ipt_set");
@@ -439,6 +435,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
{
const struct xt_set_info_target_v3 *info = par->targinfo;
ip_set_id_t index;
+ int ret = 0;
if (info->add_set.index != IPSET_INVALID_ID) {
index = ip_set_nfnl_get_byindex(par->net,
@@ -456,17 +453,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find del_set index %u as target\n",
info->del_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_add;
}
}
if (info->map_set.index != IPSET_INVALID_ID) {
if (strncmp(par->table, "mangle", 7)) {
pr_info_ratelimited("--map-set only usable from mangle table\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) |
(info->flags & IPSET_FLAG_MAP_SKBQUEUE)) &&
@@ -474,20 +470,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
1 << NF_INET_LOCAL_OUT |
1 << NF_INET_POST_ROUTING))) {
pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto cleanup_del;
}
index = ip_set_nfnl_get_byindex(par->net,
info->map_set.index);
if (index == IPSET_INVALID_ID) {
pr_info_ratelimited("Cannot find map_set index %u as target\n",
info->map_set.index);
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net,
- info->del_set.index);
- return -ENOENT;
+ ret = -ENOENT;
+ goto cleanup_del;
}
}
@@ -495,16 +487,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
info->del_set.dim > IPSET_DIM_MAX ||
info->map_set.dim > IPSET_DIM_MAX) {
pr_info_ratelimited("SET target dimension over the limit!\n");
- if (info->add_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->add_set.index);
- if (info->del_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->del_set.index);
- if (info->map_set.index != IPSET_INVALID_ID)
- ip_set_nfnl_put(par->net, info->map_set.index);
- return -ERANGE;
+ ret = -ERANGE;
+ goto cleanup_mark;
}
return 0;
+cleanup_mark:
+ if (info->map_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->map_set.index);
+cleanup_del:
+ if (info->del_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->del_set.index);
+cleanup_add:
+ if (info->add_set.index != IPSET_INVALID_ID)
+ ip_set_nfnl_put(par->net, info->add_set.index);
+ return ret;
}
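The v3 checkentry rework collapses three copies of put-everything-on-error into one goto ladder, so each failure point releases exactly the references acquired before it, in reverse order. The shape of the pattern, with illustrative names:

	static int setup(void)
	{
		int ret;

		ret = take_a();
		if (ret)
			return ret;	/* nothing to undo yet */
		ret = take_b();
		if (ret)
			goto undo_a;
		ret = take_c();
		if (ret)
			goto undo_b;
		return 0;

	undo_b:
		release_b();
	undo_a:
		release_a();
		return ret;
	}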
static void
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index ada144e5645b..76e01f292aaf 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Transparent proxy support for Linux/iptables
*
* Copyright (C) 2007-2008 BalaBit IT Ltd.
* Author: Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -81,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -142,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
transparent && sk_fullsock(sk))
- pskb->mark = sk->sk_mark;
+ pskb->mark = READ_ONCE(sk->sk_mark);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -220,6 +216,16 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par)
return 0;
}
+static void socket_mt_destroy(const struct xt_mtdtor_param *par)
+{
+ if (par->family == NFPROTO_IPV4)
+ nf_defrag_ipv4_disable(par->net);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ else if (par->family == NFPROTO_IPV6)
+ nf_defrag_ipv6_disable(par->net);
+#endif
+}
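The new destroy hook balances the defrag reference the checkentry path takes: nf_defrag_ipv4_enable()/nf_defrag_ipv6_enable() are refcounted per netns, and without the matching _disable() call the defragmentation hooks stayed registered after the last socket rule was removed. The pairing, sketched with illustrative names:

	static int example_check(const struct xt_mtchk_param *par)
	{
		/* Takes a per-netns defrag reference; may fail. */
		return nf_defrag_ipv4_enable(par->net);
	}

	static void example_destroy(const struct xt_mtdtor_param *par)
	{
		/* Drops the reference when the rule is deleted. */
		nf_defrag_ipv4_disable(par->net);
	}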
+
static struct xt_match socket_mt_reg[] __read_mostly = {
{
.name = "socket",
@@ -235,6 +241,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.revision = 1,
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
+ .destroy = socket_mt_destroy,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -249,6 +256,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v1_check,
.matchsize = sizeof(struct xt_socket_mtinfo1),
+ .destroy = socket_mt_destroy,
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
@@ -260,6 +268,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -272,6 +281,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v2_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -284,6 +294,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV4,
.match = socket_mt4_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
@@ -296,6 +307,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
.family = NFPROTO_IPV6,
.match = socket_mt6_v1_v2_v3,
.checkentry = socket_mt_v3_check,
+ .destroy = socket_mt_destroy,
.matchsize = sizeof(struct xt_socket_mtinfo1),
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN),
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 0b41c0befe3c..bbe07b1be9a3 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match connection tracking information. */
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 8710fdba2ae2..b26c1dcfc27b 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>.
*/
@@ -37,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par)
switch (info->mode) {
case XT_STATISTIC_MODE_RANDOM:
- if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability)
+ if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability)
ret = !ret;
break;
case XT_STATISTIC_MODE_NTH:
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index be1feddadcf0..8ce25bc9b277 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* String matching match for iptables
*
* (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/gfp.h>
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index c53d4d18eadf..37704ab01799 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match TCP MSS values. */
/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
* Portions (C) 2005 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index ade024c90f4f..e8991130a3de 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -1,8 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/module.h>
#include <net/ip.h>
#include <linux/ipv6.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/udp.h>
@@ -19,6 +21,8 @@ MODULE_ALIAS("ipt_udp");
MODULE_ALIAS("ipt_tcp");
MODULE_ALIAS("ip6t_udp");
MODULE_ALIAS("ip6t_tcp");
+MODULE_ALIAS("ipt_icmp");
+MODULE_ALIAS("ip6t_icmp6");
/* Returns 1 if the port is matched by the range, 0 otherwise */
static inline bool
@@ -160,6 +164,95 @@ static int udp_mt_check(const struct xt_mtchk_param *par)
return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0;
}
+/* Returns 1 if the type and code are matched by the range, 0 otherwise */
+static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code)
+{
+ return type == test_type && code >= min_code && code <= max_code;
+}
+
+static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return (test_type == 0xFF ||
+ type_code_in_range(test_type, min_code, max_code, type, code))
+ ^ invert;
+}
+
+static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code,
+ u8 type, u8 code, bool invert)
+{
+ return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert;
+}
+
+static bool
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmphdr *ic;
+ struct icmphdr _icmph;
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->type, ic->code,
+ !!(icmpinfo->invflags & IPT_ICMP_INV));
+}
+
+static bool
+icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct icmp6hdr *ic;
+ struct icmp6hdr _icmph;
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ /* Must not be a fragment. */
+ if (par->fragoff != 0)
+ return false;
+
+ ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
+ if (!ic) {
+ /* We've been asked to examine this packet, and we
+ * can't. Hence, no choice but to drop.
+ */
+ par->hotdrop = true;
+ return false;
+ }
+
+ return icmp6_type_code_match(icmpinfo->type,
+ icmpinfo->code[0],
+ icmpinfo->code[1],
+ ic->icmp6_type, ic->icmp6_code,
+ !!(icmpinfo->invflags & IP6T_ICMP_INV));
+}
+
+static int icmp_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ipt_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+}
+
+static int icmp6_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_icmp *icmpinfo = par->matchinfo;
+
+ return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+}
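For IPv4, a test type of 0xFF acts as a wildcard so a bare "-p icmp" rule still matches every ICMP packet; otherwise the packet's type must equal the test type and its code must fall in [code[0], code[1]]. A worked example, assuming iptables encodes "--icmp-type 8" as test_type = 8 with code range 0..0xFF:

	/* echo-request (type 8, code 0): type and code in range */
	icmp_type_code_match(8, 0, 0xFF, 8, 0, false);	/* -> true */
	/* echo-reply (type 0): type mismatch */
	icmp_type_code_match(8, 0, 0xFF, 0, 0, false);	/* -> false */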
+
static struct xt_match tcpudp_mt_reg[] __read_mostly = {
{
.name = "tcp",
@@ -215,6 +308,24 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = {
.proto = IPPROTO_UDPLITE,
.me = THIS_MODULE,
},
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "icmp6",
+ .match = icmp6_match,
+ .matchsize = sizeof(struct ip6t_icmp),
+ .checkentry = icmp6_checkentry,
+ .proto = IPPROTO_ICMPV6,
+ .family = NFPROTO_IPV6,
+ .me = THIS_MODULE,
+ },
};
static int __init tcpudp_mt_init(void)
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index c13bcd0ab491..6aa12d0f54e2 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -5,7 +5,7 @@
* based on ipt_time by Fabrice MARIE <fabrice@netfilter.org>
* This is a module which is used for time matching
* It is using some modified code from dietlibc (localtime() function)
- * that you can find at http://www.fefe.de/dietlibc/
+ * that you can find at https://www.fefe.de/dietlibc/
* This file is distributed under the terms of the GNU General Public
* License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
*/
@@ -77,12 +77,12 @@ static inline bool is_leap(unsigned int y)
* This is done in three separate functions so that the most expensive
* calculations are done last, in case a "simple match" can be found earlier.
*/
-static inline unsigned int localtime_1(struct xtm *r, time_t time)
+static inline unsigned int localtime_1(struct xtm *r, time64_t time)
{
unsigned int v, w;
/* Each day has 86400s, so finding the hour/minute is actually easy. */
- v = time % SECONDS_PER_DAY;
+ div_u64_rem(time, SECONDS_PER_DAY, &v);
r->second = v % 60;
w = v / 60;
r->minute = w % 60;
@@ -90,13 +90,13 @@ static inline unsigned int localtime_1(struct xtm *r, time_t time)
return v;
}
-static inline void localtime_2(struct xtm *r, time_t time)
+static inline void localtime_2(struct xtm *r, time64_t time)
{
/*
* Here comes the rest (weekday, monthday). First, divide the SSTE
* by seconds-per-day to get the number of _days_ since the epoch.
*/
- r->dse = time / 86400;
+ r->dse = div_u64(time, SECONDS_PER_DAY);
/*
* 1970-01-01 (w=0) was a Thursday (4).
@@ -105,7 +105,7 @@ static inline void localtime_2(struct xtm *r, time_t time)
r->weekday = (4 + r->dse - 1) % 7 + 1;
}
-static void localtime_3(struct xtm *r, time_t time)
+static void localtime_3(struct xtm *r, time64_t time)
{
unsigned int year, i, w = r->dse;
@@ -160,22 +160,27 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
const struct xt_time_info *info = par->matchinfo;
unsigned int packet_time;
struct xtm current_time;
- s64 stamp;
+ time64_t stamp;
/*
- * We cannot use get_seconds() instead of __net_timestamp() here.
+ * We need real time here, but we can neither use skb->tstamp
+ * nor __net_timestamp().
+ *
+ * skb->tstamp and skb->skb_mstamp_ns overlap, however, they
+ * use different clock types (real vs monotonic).
+ *
* Suppose you have two rules:
- * 1. match before 13:00
- * 2. match after 13:00
- * If you match against processing time (get_seconds) it
+ * 1. match before 13:00
+ * 2. match after 13:00
+ *
+ * If you match against processing time (ktime_get_real_seconds) it
* may happen that the same packet matches both rules if
- * it arrived at the right moment before 13:00.
+ * it arrived at the right moment before 13:00, so it would be
+ * better to check skb->tstamp and set it via __net_timestamp()
+ * if needed. This, however, breaks the tx timestamps of outgoing
+ * packets and causes them to be delayed forever by the fq packet
+ * scheduler.
*/
- if (skb->tstamp == 0)
- __net_timestamp((struct sk_buff *)skb);
-
- stamp = ktime_to_ns(skb->tstamp);
- stamp = div_s64(stamp, NSEC_PER_SEC);
+ stamp = ktime_get_real_seconds();
if (info->flags & XT_TIME_LOCAL_TZ)
/* Adjust for local timezone */
@@ -188,6 +193,9 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
* - 'now' is in the weekday mask
* - 'now' is in the daytime range time_start..time_end
* (and by default, libxt_time will set these so as to match)
+ *
+ * note: info->date_start/stop are unsigned 32-bit values that
+ * can hold values beyond y2038, but not after y2106.
*/
if (stamp < info->date_start || stamp > info->date_stop)
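Switching the stamp to time64_t keeps the match correct past y2038 on 32-bit builds, where a plain 64-bit '/' or '%' would emit calls to libgcc helpers the kernel does not link; the math64.h helpers do the division explicitly, as in the localtime_1()/localtime_2() changes above. A sketch:

	#include <linux/math64.h>

	time64_t now = ktime_get_real_seconds();
	u32 secs_today;
	u64 days;

	/* Quotient in one step, 32-bit remainder via the out-pointer. */
	days = div_u64_rem(now, SECONDS_PER_DAY, &secs_today);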
diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index a95b50342dbb..117d4615d668 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* xt_u32 - kernel module to match u32 packet content
*
@@ -95,11 +96,32 @@ static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par)
return ret ^ data->invert;
}
+static int u32_mt_checkentry(const struct xt_mtchk_param *par)
+{
+ const struct xt_u32 *data = par->matchinfo;
+ const struct xt_u32_test *ct;
+ unsigned int i;
+
+ if (data->ntests > ARRAY_SIZE(data->tests))
+ return -EINVAL;
+
+ for (i = 0; i < data->ntests; ++i) {
+ ct = &data->tests[i];
+
+ if (ct->nnums > ARRAY_SIZE(ct->location) ||
+ ct->nvalues > ARRAY_SIZE(ct->value))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct xt_match xt_u32_mt_reg __read_mostly = {
.name = "u32",
.revision = 0,
.family = NFPROTO_UNSPEC,
.match = u32_mt,
+ .checkentry = u32_mt_checkentry,
.matchsize = sizeof(struct xt_u32),
.me = THIS_MODULE,
};
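Like the xt_sctp flag_count check earlier in this series, u32_mt_checkentry() validates the user-supplied counts (ntests, nnums, nvalues) against the fixed arrays they index at rule-insertion time, so the per-packet match loop can trust them. The general rule, in miniature:

	/* Never let a count copied from userspace exceed the array it
	 * walks; reject the rule up front rather than bounds-checking
	 * on every packet.
	 */
	if (data->ntests > ARRAY_SIZE(data->tests))
		return -EINVAL;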